<!DOCTYPE html><html lang="zh-CN" data-theme="light"><head><meta charset="UTF-8"><meta http-equiv="X-UA-Compatible" content="IE=edge"><meta name="viewport" content="width=device-width, initial-scale=1.0,viewport-fit=cover"><title>【2023-10-25 组会分享】大语言模型综述 | David 的博客</title><meta name="author" content="David"><meta name="copyright" content="David"><meta name="format-detection" content="telephone=no"><meta name="theme-color" content="#ffffff"><meta name="description" content="大语言模型综述A Survey of Large Language Models 文章基本信息如下  文章名字：A Survey of Large Language Models 发表时间：2023年3月 发表期刊：arxiv(预印版) 发表机构：中国人民大学 - AI Box小组 (ruc.edu.cn)   0 摘要 (Abstract) 时代背景：  自从20世纪50年代图灵测试被提出以来，">
<meta property="og:type" content="article">
<meta property="og:title" content="【2023-10-25 组会分享】大语言模型综述">
<meta property="og:url" content="https://blog.david-deng.cn/2023/10/25/1%202023-10-25%20%E7%BB%84%E4%BC%9A%E5%88%86%E4%BA%AB/index.html">
<meta property="og:site_name" content="David 的博客">
<meta property="og:description" content="大语言模型综述A Survey of Large Language Models 文章基本信息如下  文章名字：A Survey of Large Language Models 发表时间：2023年3月 发表期刊：arxiv(预印版) 发表机构：中国人民大学 - AI Box小组 (ruc.edu.cn)   0 摘要 (Abstract) 时代背景：  自从20世纪50年代图灵测试被提出以来，">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-2.png">
<meta property="article:published_time" content="2023-10-24T16:00:00.000Z">
<meta property="article:modified_time" content="2023-11-16T13:34:22.000Z">
<meta property="article:author" content="David">
<meta property="article:tag" content="AI">
<meta property="article:tag" content="NLP">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-2.png"><link rel="shortcut icon" href="/img/favicon.svg"><link rel="canonical" href="https://blog.david-deng.cn/2023/10/25/1%202023-10-25%20%E7%BB%84%E4%BC%9A%E5%88%86%E4%BA%AB/index.html"><link rel="preconnect" href="//cdn.jsdelivr.net"/><link rel="preconnect" href="//busuanzi.ibruce.info"/><link rel="stylesheet" href="/css/index.css"><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free/css/all.min.css"><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/node-snackbar/dist/snackbar.min.css" media="print" onload="this.media='all'"><script>
    (() => {
      
    /**
     * Small localStorage wrapper that stores every value together with an
     * expiry timestamp so stale entries are discarded on read.
     */
    const saveToLocal = {
      /**
       * Store `value` under `key`.
       * @param {string} key - localStorage key
       * @param {*} value - any JSON-serialisable value
       * @param {number} ttl - lifetime in days; a falsy ttl stores nothing
       */
      set: (key, value, ttl) => {
        if (!ttl) return
        // 86400000 ms = one day
        const expiry = Date.now() + ttl * 86400000
        localStorage.setItem(key, JSON.stringify({ value, expiry }))
      },
      /**
       * Read the value stored under `key`.
       * @param {string} key - localStorage key
       * @returns {*} the stored value, or undefined when the entry is missing,
       *   expired, or cannot be read as a `{ value, expiry }` record
       */
      get: key => {
        const itemStr = localStorage.getItem(key)
        if (!itemStr) return undefined
        let item
        try {
          item = JSON.parse(itemStr)
        } catch (err) {
          // Not valid JSON: treat it as missing rather than letting the
          // exception abort the caller.
          return undefined
        }
        // JSON `null` cannot be destructured; primitives carry no value field.
        if (item === null || typeof item !== 'object') return undefined
        const { value, expiry } = item
        if (Date.now() > expiry) {
          localStorage.removeItem(key)
          return undefined
        }
        return value
      }
    }

    // Shared helper namespace published on window.
    window.btf = {
      saveToLocal: saveToLocal,

      // Append an async <script> to <head>; resolves once it has loaded and
      // rejects on a load error. `attr` entries are copied onto the element.
      getScript: (url, attr = {}) => new Promise((resolve, reject) => {
        const el = document.createElement('script')
        el.src = url
        el.async = true
        for (const [attrName, attrValue] of Object.entries(attr)) {
          el.setAttribute(attrName, attrValue)
        }
        const onReady = () => {
          const state = el.readyState
          if (!state || /loaded|complete/.test(state)) resolve()
        }
        el.onload = el.onreadystatechange = onReady
        el.onerror = reject
        document.head.appendChild(el)
      }),

      // Append a stylesheet <link> to <head>, optionally with an id; resolves
      // once it has loaded and rejects on a load error.
      getCSS: (url, id) => new Promise((resolve, reject) => {
        const el = document.createElement('link')
        el.rel = 'stylesheet'
        el.href = url
        if (id) el.id = id
        const onReady = () => {
          const state = el.readyState
          if (!state || /loaded|complete/.test(state)) resolve()
        }
        el.onload = el.onreadystatechange = onReady
        el.onerror = reject
        document.head.appendChild(el)
      }),

      // Register `fn` in parent.globalFn[key], under `name` when given and
      // otherwise under the next numeric index. Keys starting with 'pjax'
      // are ignored.
      addGlobalFn: (key, fn, name = false, parent = window) => {
        if (key.startsWith('pjax')) return
        const registry = parent.globalFn || {}
        if (!registry[key]) registry[key] = {}
        const slot = name || Object.keys(registry[key]).length
        registry[key][slot] = fn
        parent.globalFn = registry
      }
    }
  
      
      // Switch the document to the dark theme and, when a theme-color meta tag
      // exists, set it to the dark colour.
      const activateDarkMode = () => {
        document.documentElement.setAttribute('data-theme', 'dark')
        const themeColorMeta = document.querySelector('meta[name="theme-color"]')
        if (themeColorMeta === null) return
        themeColorMeta.setAttribute('content', '#0d0d0d')
      }
      // Switch the document to the light theme and, when a theme-color meta tag
      // exists, set it to the light colour.
      const activateLightMode = () => {
        document.documentElement.setAttribute('data-theme', 'light')
        const themeColorMeta = document.querySelector('meta[name="theme-color"]')
        if (themeColorMeta === null) return
        themeColorMeta.setAttribute('content', '#ffffff')
      }

      // Publish both theme switchers on the shared btf namespace.
      Object.assign(btf, { activateDarkMode, activateLightMode })

      // Pick the initial colour scheme. A choice saved under 'theme' wins;
      // otherwise follow the prefers-color-scheme media queries, and fall back
      // to the local time of day when neither query matches.
      const theme = saveToLocal.get('theme')
      const mediaQueryDark = window.matchMedia('(prefers-color-scheme: dark)')
      const mediaQueryLight = window.matchMedia('(prefers-color-scheme: light)')

      if (theme === undefined) {
        if (mediaQueryLight.matches) activateLightMode()
        else if (mediaQueryDark.matches) activateDarkMode()
        else {
          // Neither query matches: treat 18:00 through 06:59 as night.
          const hour = new Date().getHours()
          const isNight = hour <= 6 || hour >= 18
          isNight ? activateDarkMode() : activateLightMode()
        }
        // Keep following scheme changes while no choice is saved. The handler
        // must declare the event parameter: `e.matches` carries the new state
        // of the dark-scheme query.
        mediaQueryDark.addEventListener('change', e => {
          if (saveToLocal.get('theme') === undefined) {
            e.matches ? activateDarkMode() : activateLightMode()
          }
        })
      } else {
        theme === 'light' ? activateLightMode() : activateDarkMode()
      }
        
      
      // Re-apply the value saved under 'aside-status': the root element gets
      // the 'hide-aside' class only when that value is 'hide'.
      const asideStatus = saveToLocal.get('aside-status')
      if (asideStatus !== undefined) {
        const shouldHide = asideStatus === 'hide'
        document.documentElement.classList.toggle('hide-aside', shouldHide)
      }
    
      
    // Add an 'apple' class to the root element when the user-agent string
    // mentions iPad, iPhone, iPod or Macintosh.
    const detectApple = () => {
      const isApple = /iPad|iPhone|iPod|Macintosh/.test(navigator.userAgent)
      if (!isApple) return
      document.documentElement.classList.add('apple')
    }
    detectApple()
  
    })()
  </script><script>const GLOBAL_CONFIG = {
  // Properties of the GLOBAL_CONFIG object literal (site-wide settings).
  root: '/',
  algolia: undefined,
  localSearch: undefined,
  // Button labels: "繁" = Traditional Chinese, "简" = Simplified Chinese.
  translate: {"defaultEncoding":2,"translateDelay":0,"msgToTraditionalChinese":"繁","msgToSimplifiedChinese":"简"},
  highlight: {"plugin":"highlight.js","highlightCopy":true,"highlightLang":true,"highlightHeightLimit":false,"highlightFullpage":false,"highlightMacStyle":false},
  // Messages: "copied successfully", "copy failed", "browser not supported".
  copy: {
    success: '复制成功',
    error: '复制失败',
    noSupport: '浏览器不支持'
  },
  relativeDate: {
    homepage: true,
    post: false
  },
  runtime: '',
  // Suffixes: "just now", "minutes ago", "hours ago", "days ago", "months ago".
  dateSuffix: {
    just: '刚刚',
    min: '分钟前',
    hour: '小时前',
    day: '天前',
    month: '个月前'
  },
  // Attribution strings: "Author", "Link", "Source", plus a rights notice.
  copyright: {"limitCount":50,"languages":{"author":"作者: David","link":"链接: ","source":"来源: David 的博客","info":"著作权归作者所有。商业转载请联系作者获得授权，非商业转载请注明出处。"}},
  // NOTE(review): this is the string 'null', not the null value — confirm
  // that consumers compare against the string.
  lightbox: 'null',
  // Toast messages for language and day/night switches, plus colours and position.
  Snackbar: {"chs_to_cht":"已切换为繁体中文","cht_to_chs":"已切换为简体中文","day_to_night":"已切换为深色模式","night_to_day":"已切换为浅色模式","bgLight":"#49b1f5","bgDark":"#121212","position":"bottom-left"},
  infinitegrid: {
    js: 'https://cdn.jsdelivr.net/npm/@egjs/infinitegrid/dist/infinitegrid.min.js',
    // "Load more"
    buttonText: '加载更多'
  },
  isPhotoFigcaption: false,
  islazyload: false,
  isAnchor: true,
  percent: {
    toc: true,
    rightside: false,
  },
  autoDarkmode: true
}</script><script id="config-diff">var GLOBAL_CONFIG_SITE = {
  // Properties of the GLOBAL_CONFIG_SITE object literal (per-page settings).
  // The title reads "[2023-10-25 group meeting talk] A survey of large language models".
  title: '【2023-10-25 组会分享】大语言模型综述',
  isPost: true,
  isHome: false,
  isHighlightShrink: false,
  isToc: true,
  isShuoshuo: false
}</script><style type="text/css">#toggle-sidebar {bottom: 80px}</style><meta name="generator" content="Hexo 7.3.0"></head><body><div id="sidebar"><div id="menu-mask"></div><div id="sidebar-menus"><div class="avatar-img text-center"><img src="/img/avatar.png" onerror="onerror=null;src='/img/loading.gif'" alt="avatar"/></div><div class="site-data text-center"><a href="/archives/"><div class="headline">文章</div><div class="length-num">27</div></a><a href="/tags/"><div class="headline">标签</div><div class="length-num">28</div></a><a href="/categories/"><div class="headline">分类</div><div class="length-num">28</div></a></div><div class="menus_items"><div class="menus_item"><a class="site-page" href="/"><i class="fa-fw fas fa-home"></i><span> 首页</span></a></div><div class="menus_item"><span class="site-page group"><i class="fa-fw fas fa-compass"></i><span> 目录</span><i class="fas fa-chevron-down"></i></span><ul class="menus_item_child"><li><a class="site-page child" href="/archives/"><i class="fa-fw fas fa-archive"></i><span> 归档</span></a></li><li><a class="site-page child" href="/tags/"><i class="fa-fw fas fa-tags"></i><span> 标签</span></a></li><li><a class="site-page child" href="/categories/"><i class="fa-fw fas fa-folder-open"></i><span> 分类</span></a></li></ul></div><div class="menus_item"><a class="site-page" href="/link/"><i class="fa-fw fas fa-link"></i><span> 友情链接</span></a></div><div class="menus_item"><a class="site-page" href="/about/"><i class="fa-fw fas fa-heart"></i><span> 关于</span></a></div></div></div></div><div class="post" id="body-wrap"><header class="post-bg" id="page-header" style="background-image: url(https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-2.png);"><nav id="nav"><span id="blog-info"><a class="nav-site-title" href="/"><span class="site-name">David 的博客</span></a><a class="nav-page-title" href="/"><span class="site-name">【2023-10-25 组会分享】大语言模型综述</span></a></span><div id="menus"><div class="menus_items"><div class="menus_item"><a 
class="site-page" href="/"><i class="fa-fw fas fa-home"></i><span> 首页</span></a></div><div class="menus_item"><span class="site-page group"><i class="fa-fw fas fa-compass"></i><span> 目录</span><i class="fas fa-chevron-down"></i></span><ul class="menus_item_child"><li><a class="site-page child" href="/archives/"><i class="fa-fw fas fa-archive"></i><span> 归档</span></a></li><li><a class="site-page child" href="/tags/"><i class="fa-fw fas fa-tags"></i><span> 标签</span></a></li><li><a class="site-page child" href="/categories/"><i class="fa-fw fas fa-folder-open"></i><span> 分类</span></a></li></ul></div><div class="menus_item"><a class="site-page" href="/link/"><i class="fa-fw fas fa-link"></i><span> 友情链接</span></a></div><div class="menus_item"><a class="site-page" href="/about/"><i class="fa-fw fas fa-heart"></i><span> 关于</span></a></div></div><div id="toggle-menu"><span class="site-page"><i class="fas fa-bars fa-fw"></i></span></div></div></nav><div id="post-info"><h1 class="post-title">【2023-10-25 组会分享】大语言模型综述</h1><div id="post-meta"><div class="meta-firstline"><span class="post-meta-date"><i class="far fa-calendar-alt fa-fw post-meta-icon"></i><span class="post-meta-label">发表于</span><time class="post-meta-date-created" datetime="2023-10-24T16:00:00.000Z" title="发表于 2023-10-25 00:00:00">2023-10-25</time><span class="post-meta-separator">|</span><i class="fas fa-history fa-fw post-meta-icon"></i><span class="post-meta-label">更新于</span><time class="post-meta-date-updated" datetime="2023-11-16T13:34:22.000Z" title="更新于 2023-11-16 21:34:22">2023-11-16</time></span><span class="post-meta-categories"><span class="post-meta-separator">|</span><i class="fas fa-inbox fa-fw post-meta-icon"></i><a class="post-meta-categories" href="/categories/AI/">AI</a><i class="fas fa-angle-right post-meta-separator"></i><i class="fas fa-inbox fa-fw post-meta-icon"></i><a class="post-meta-categories" href="/categories/AI/NLP/">NLP</a></span></div><div class="meta-secondline"><span 
class="post-meta-separator">|</span><span class="post-meta-wordcount"><i class="far fa-file-word fa-fw post-meta-icon"></i><span class="post-meta-label">总字数:</span><span class="word-count">10.1k</span><span class="post-meta-separator">|</span><i class="far fa-clock fa-fw post-meta-icon"></i><span class="post-meta-label">阅读时长:</span><span>37分钟</span></span><span class="post-meta-separator">|</span><span class="post-meta-pv-cv" id="" data-flag-title=""><i class="far fa-eye fa-fw post-meta-icon"></i><span class="post-meta-label">浏览量:</span><span id="busuanzi_value_page_pv"><i class="fa-solid fa-spinner fa-spin"></i></span></span></div></div></div></header><main class="layout" id="content-inner"><div id="post"><article class="container post-content" id="article-container"><div id="post-outdate-notice" data="{&quot;limitDay&quot;:90,&quot;messagePrev&quot;:&quot;文章距离最近一次更新已经&quot;,&quot;messageNext&quot;:&quot;天，文章的内容可能已经过期。&quot;,&quot;postUpdate&quot;:&quot;2023-11-16 21:34:22&quot;}" hidden></div><h1 id="大语言模型综述A-Survey-of-Large-Language-Models"><a href="#大语言模型综述A-Survey-of-Large-Language-Models" class="headerlink" title="大语言模型综述A Survey of Large Language Models"></a>大语言模型综述<br/><span style='text-align: center; font-size: 16px;'>A Survey of Large Language Models</span ></h1><blockquote>
<p>文章基本信息如下</p>
<ul>
<li>文章名字：A Survey of Large Language Models</li>
<li>发表时间：2023年3月</li>
<li>发表期刊：arxiv(预印版)</li>
<li>发表机构：<a target="_blank" rel="noopener external nofollow noreferrer" href="http://aibox.ruc.edu.cn/">中国人民大学 - AI Box小组 (ruc.edu.cn)</a></li>
</ul>
</blockquote>
<h2 id="0-摘要-Abstract"><a href="#0-摘要-Abstract" class="headerlink" title="0 摘要 (Abstract)"></a>0 摘要 <span style='font-size: 14px;'>(Abstract)</span></h2><blockquote>
<p><strong>时代背景：</strong></p>
<ol>
<li>自从20世纪50年代图灵测试被提出以来，人类一直在探索如何用机器掌握语言智能。</li>
<li>近年来，通过在大规模语料库上对Transformer 模型进行预训练，人们提出了预训练语言模型（Pre-training Language Model, PLM），其在解决各种自然语言处理（Natural Language Processing, NLP）任务方面表现出强大的能力。</li>
<li>近年来，学术界和工业界极大地推进了针对LLM的研究，其中一个显著的进展是推出了ChatGPT（一种基于LLM开发的强大AI聊天机器人），它引起了社会的广泛关注。</li>
</ol>
</blockquote>
<blockquote>
<p><strong>研究发现：</strong></p>
<ol>
<li>研究人员发现扩展模型规模可以提高模型能力，因此他们通过将参数增加到更大的尺寸来进一步研究该效应。</li>
<li>有趣的是，当参数规模超过一定水平时，这些规模更大的语言模型的性能不仅得到了显著提升，而且还表现出一些小规模语言模型（例如BERT）所不具备的特殊能力（例如上下文学习）。</li>
</ol>
</blockquote>
<blockquote>
<p><strong>文章讲了什么？</strong></p>
<ol>
<li>通过介绍大语言模型(Large Language Model)的背景、主要发现和主流技术来回顾近年来的进展。</li>
<li>主要关注大语言模型(Large Language Model)以下四个主要方面：①预训练，②适配微调，③使用，④能力评估</li>
<li>总结了开发LLM的可用资源，并讨论了LLM现有的问题和未来的发展方向。</li>
<li>提供了关于 LLM 的最新文献综述。</li>
</ol>
</blockquote>
<h2 id="1-大语言模型近年来的发展"><a href="#1-大语言模型近年来的发展" class="headerlink" title="1 大语言模型近年来的发展"></a>1 大语言模型近年来的发展</h2><h3 id="1-1-大语言模型的背景"><a href="#1-1-大语言模型的背景" class="headerlink" title="1.1 大语言模型的背景"></a>1.1 大语言模型的背景</h3><blockquote>
<p>自从20世纪50年代图灵测试被提出以来，人类一直在探索如何用机器掌握语言智能。机器除非配备了强大的人工智能算法，否则不能自然地掌握以人类语言形式理解和交流的能力。实现让机器像人类一样阅读、写作和交流的目标， 一直是一个长期的研究挑战。作为一种主要的语言理解和生成方法，语言建模在过去的二十年中得到了广泛的研究，并从<strong>统计语言模型</strong>逐步发展为<strong>神经语言模型</strong>。</p>
</blockquote>
<h4 id="1-1-1-语言建模的四个主要发展阶段："><a href="#1-1-1-语言建模的四个主要发展阶段：" class="headerlink" title="1.1.1 语言建模的四个主要发展阶段："></a>1.1.1 语言建模的四个主要发展阶段：</h4><ol>
<li><strong>统计语言模型(SLM)</strong><ul>
<li>20世纪90年代，学术界对于统计语言模型(SLM)的研究开始兴起。</li>
<li>统计语言模型(SLM)基于统计学习方法开发，其基本思想是基于马尔可夫假设建立词预测模型，例如根据最近的上下文预测下一个词。</li>
<li>具有固定上下文长度n的统计语言模型(SLM)也称为n元语言模型，例如 bi-gram 和 tri-gram 语言模型。<ul>
<li>bi-gram语言模型：当前词出现的概率只与前一个词相关</li>
<li>tri-gram语言模型：当前词出现的概率只与前两个词相关</li>
</ul>
</li>
<li>统计语言模型(SLM)已被广泛应用于提高信息检索（IR）和自然语言处理（NLP）的任务性能。</li>
</ul>
</li>
<li><strong>神经语言模型(NLM)</strong><ol>
<li>神经语言模型(NLM)通过神经网络， 如循环神经网络（RNN），来描述单词序列的概率。</li>
<li>word2vec提出了构建一个简化的浅层神经网络来学习分布式单词表示的方法，这些表示在各种 NLP 任务中被证明非常有效。</li>
<li>这些研究开创了将语言模型用于表示学习（超越词序列建模）的应用，对NLP领域产生了重要影响。</li>
</ol>
</li>
<li><strong>预训练语言模型(PLM)</strong><ol>
<li>ELMo被提出来通过预训练一个双向LSTM（bi-LSTM）网络来捕捉上下文感知的词表示，然后根据特定的下游任务微调bi-LSTM网络。</li>
<li>基于自注意力机制的高度并行化Transformer架构，BERT作为双向语言模型，在大规模无标签语料库上使用专门设计的预训练任务进行预训练。这些预训练的上下文感知词表示作为通用语义特征非常有效，极大地提高了NLP任务的性能。</li>
</ol>
</li>
<li><strong>大语言模型(LLM)</strong><ol>
<li>扩展PLM（例如扩展模型大小或数据大小）通常会提高下游任务的模型性能。许多研究通过训练越来越大的PLM（例如175B参数的GPT-3，540B参数的PaLM和2200B参数的GPT4）来探索性能极限。</li>
<li>尽管扩展主要在模型大小方面进行（使用类似的架构和预训练任务），但这些大规模的PLM与较小的PLM（例如0.3B参数的BERT和1.5B参数的GPT-2）表现出不同的行为，并在解决一系列复杂任务中展示了惊人的能力（称为涌现能力）。例如，GPT-3可以通过上下文学习（in-context learning, ICL）来解决小样本任务，而GPT-2则表现不佳。</li>
</ol>
</li>
</ol>
<h4 id="1-1-2-研究热度统计："><a href="#1-1-2-研究热度统计：" class="headerlink" title="1.1.2 研究热度统计："></a>1.1.2 研究热度统计：</h4><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/arxiv_llms.png" alt="arxiv_llms" style="zoom:100%;" />

<blockquote>
<ul>
<li>(a)图显示了包含关键词<code>Language Model</code>的arXiv文章累计数量统计（自2018年6月起）</li>
<li>(b)图显示了包含关键词<code>Large Language Model</code>的arXiv文章累计数量统计（自2019年10月起）</li>
</ul>
<p>通过按月份查询标题或摘要中的关键词，使用精确匹配计算统计数据。从(b)图中可以看出，ChatGPT发布后，标题或摘要中包含“大型语言模型”的已发表arXiv论文的平均数量急剧增加，从每天0.40篇增加到每天8.58篇。</p>
</blockquote>
<h3 id="1-2-主要发现"><a href="#1-2-主要发现" class="headerlink" title="1.2 主要发现"></a>1.2 主要发现</h3><blockquote>
<p>问题一：多大的模型才能算是大语言模型呢？</p>
</blockquote>
<ul>
<li>Bing AI：The definition of “large” is fuzzy, but “large” has been used to describe BERT (110M parameters) as well as PaLM 2 (up to 340B parameters). Parameters are the weights the model learned during training, used to predict the next token in the sequence.</li>
<li>wikipedia：大语言模型 (英语：large language model，LLM) 是一种语言模型，由具有许多参数（通常数十亿个权重或更多）的人工神经网络组成，使用自监督学习或半监督学习对大量未标记文本进行训练。大型语言模型在2018年左右出现，并在各种任务中表现出色。</li>
<li><strong>本综述的观点：</strong>大语言模型是指包含数千亿（或更多）参数的Transformer语言模型，这些模型是在大规模文本数据上进行训练的，例如GPT-3，PaLM，Galactica 和 LLaMA。</li>
<li><strong>我的理解：</strong><ul>
<li>通过上面的回答我们可以看出，现在的研究中并没有对于多大的模型才能算作大语言模型有一个确切的定义。通常大语言模型一般拥有数十亿或者更多的参数。</li>
<li>对于本综述的观点我并不是非常的认可。因为我们知道 LLaMA 2 拥有三个版本，参数量分别是7B、13B和70B，没有达到综述中的千亿参数的规模，但是 LLaMA 2 还是被大家公认为是一个预训练的大语言模型。</li>
</ul>
</li>
</ul>
<blockquote>
<p>问题二：LLM和PLM有什么区别呢？</p>
</blockquote>
<ol>
<li>LLM表现出一些令人惊讶的涌现能力，这些能力可能在以前较小的PLM中没有观察到。这些能力是LM在复杂任务上表现的关键，它使得人工智能算法具有前所未有的强大和有效性。</li>
<li>LLM将彻底改变人类开发和使用人工智能算法的方式。与小型PLM不同，访问LLM的主要方法是通过提示接口（例如GPT-4 API）。人们必须了解LLM的工作原理，并以LLM能够遵循的方式形式化他们的任务。</li>
<li>LLM的发展不再明确区分研究和工程。训练LLM需要在大规模数据处理和分布式并行训练方面具有丰富的实践经验。为了开发出有能力的LLM，研究人员必须解决复杂的工程问题，他们需要与工程师合作或成为工程师。</li>
</ol>
<p><strong>我的理解：</strong></p>
<ul>
<li><p>大语言模型(LLM)指的是参数量非常庞大的语言模型，而预训练语言模型(PLM)指的是在一些通用的语料库上预先训练过的语言模型。</p>
</li>
<li><p>例如：前面提到的“ELMo被提出来通过预训练一个双向LSTM（bi-LSTM）”，它是预训练语言模型(PLM)但是不能算作大语言模型(LLM)，因为它的参数量没有达到大语言模型的标准(虽然这个标准很模糊)，但是它在一些语料库中进行过训练，所以能够称为预训练语言模型(PLM)。</p>
</li>
<li><p>所以，大语言模型(LLM)和预训练语言模型(PLM)之间存在交集，但不是被包含与包含的关系</p>
<img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/image-20231022212643140.png" alt="image-20231022212643140" style="zoom:100%;" /></li>
</ul>
<h3 id="1-3-大语言模型的扩展法则"><a href="#1-3-大语言模型的扩展法则" class="headerlink" title="1.3 大语言模型的扩展法则"></a>1.3 大语言模型的扩展法则</h3><h4 id="1-3-1-KM扩展法则"><a href="#1-3-1-KM扩展法则" class="headerlink" title="1.3.1 KM扩展法则"></a>1.3.1 KM扩展法则</h4><ol>
<li><p>Kaplan 等人（OpenAI 团队）于2020年首次提出了神经语言模型的性能与模型规模（N）、数据集规模（D）和训练计算量（C）之间的幂律关系。</p>
</li>
<li><p>计算公式如下<br>$$<br>\begin{gathered}<br>L(N) = \bigg(\frac{N_c}{N}\bigg)^{\alpha_N}, ~~ \alpha_N \sim 0.076, N_c \sim 8.8\times 10^{13} \\<br>L(D) = \bigg(\frac{D_c}{D}\bigg)^{\alpha_D},  ~~ \alpha_D \sim 0.095, D_c \sim 5.4\times 10^{13} \\<br>L(C) = \bigg(\frac{C_c}{C}\bigg)^{\alpha_C},  ~~ \alpha_C \sim 0.050, C_c \sim 3.1\times 10^{8}<br>\end{gathered}<br>$$</p>
</li>
<li><p>这三个规律是通过拟合模型在不同数据大小、模型大小和训练计算量下的性能得出的。结果表明，模型性能与这三个因素存在着强依赖关系。</p>
</li>
</ol>
<h4 id="1-3-2-Chinchilla-扩展法则"><a href="#1-3-2-Chinchilla-扩展法则" class="headerlink" title="1.3.2 Chinchilla 扩展法则"></a>1.3.2 Chinchilla 扩展法则</h4><ol>
<li>Hoffmann 等人（Google DeepMind 团队）提出了一种扩展法则的替代形式来指导大语言模型最优计算量的训练。</li>
<li>他们通过变化更大范围的模型大小和数据大小进行了严格的实验，并拟合了一个类似的扩展法则。</li>
</ol>
<blockquote>
<p><strong>KM扩展法则</strong>更偏向于将更大的预算分配给模型大小，而<strong>Chinchilla 扩展法则</strong>认为模型大小和数据大小应该以相同的比例增加。</p>
</blockquote>
<h3 id="1-4-大语言模型的涌现能力"><a href="#1-4-大语言模型的涌现能力" class="headerlink" title="1.4 大语言模型的涌现能力"></a>1.4 大语言模型的涌现能力</h3><blockquote>
<p><strong>大语言模型的涌现能力的定义：</strong>在小型模型中不存在但在大型模型中产生的能力</p>
<p><strong>解释：</strong>当规模达到一定水平时，性能显著提高，超出随机水平。</p>
</blockquote>
<p>大语言模型的三种典型涌现能力和具备这种能力的代表性模型：</p>
<ol>
<li>上下文学习，代表模型：GPT-3(175B的GPT-3模型在一般情况下表现出强大的上下文学习能力，但GPT-1和GPT-2模型则没有)</li>
<li>指令遵循(指令微调)，代表模型：LaMDA-PT(当模型大小达到68B时，经过指令微调的LaMDA-PT开始在未见过的任务上显著优于未微调的模型，但对于8B或更小的模型大小则不会如此。)</li>
<li>逐步推理，代表模型：PaLM(当思维链提示应用于模型大小大于60B的PaLM变体时，可以提高模型在算术推理基准任务上的性能，而当模型大小超过100B时，其相对于标准提示的优势更加明显。)</li>
</ol>
<h3 id="1-5-大语言模型的关键技术"><a href="#1-5-大语言模型的关键技术" class="headerlink" title="1.5 大语言模型的关键技术"></a>1.5 大语言模型的关键技术</h3><p>大语言模型能够成功的几个可能的关键技术：</p>
<ul>
<li>扩展：Transformer语言模型存在明显的扩展效应：更大的模型，更大的数据规模和更多的训练计算通常会导致模型能力的提升。</li>
<li>训练：由于巨大的模型规模，成功训练一种能力强的大语言模型(LLM)是非常具有挑战性的。大语言模型(LLM)一般使用分布式训练，搭配一些针对性的优化策略来进行训练。</li>
<li>能力引导：大语言模型在一些特定的任务上表现并不是很好，但是可以通过一些手段引导大模型以激发这些能力。例如：通过设计合适的任务指令或具体的上下文学习(ICL)策略或者使用自然语言表达的任务描述对大语言模型(LLM)进行指令微调，以提高LLM在未见任务上的泛化能力。</li>
<li>对齐微调：由于大语言模型(LLM)被训练用来捕捉预训练语料库的数据特征，它们可能会为人类生成有毒、偏见甚至有害的内容。因此，有必要使大语言模型(LLM)与人类价值观保持一致，例如有用性、诚实性和无害性。</li>
<li>工具操作：大语言模型(LLM)在一些不适合以纯文本形式表达的任务上表现不佳，为了解决这些问题可以利用外部工具来弥补大语言模型(LLM)的不足。例如：GPT4已经能够使用外部插件来扩展它的能力范围。</li>
</ul>
<h3 id="1-6-大模型的发展史"><a href="#1-6-大模型的发展史" class="headerlink" title="1.6 大模型的发展史"></a>1.6 大模型的发展史</h3><h4 id="1-6-1-各种大模型的发布时间"><a href="#1-6-1-各种大模型的发布时间" class="headerlink" title="1.6.1 各种大模型的发布时间"></a>1.6.1 各种大模型的发布时间</h4><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/LLMs-0623-final.png" alt="LLMs-0623-final" style="zoom:100%;" />

<blockquote>
<p>最近几年现有的大语言模型(参数量超过10B)发布的时间。黄色标记的是具有公开可用模型检查点的大语言模型。</p>
</blockquote>
<h4 id="1-6-2-GPT-系列模型的发展"><a href="#1-6-2-GPT-系列模型的发展" class="headerlink" title="1.6.2 GPT 系列模型的发展"></a>1.6.2 GPT 系列模型的发展</h4><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/GPT%20%E7%B3%BB%E5%88%97%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%BC%94%E5%8F%98.png" alt="GPT 系列模型的演变" style="zoom:100%;" />

<blockquote>
<p>GPT 系列模型的发展史：</p>
<ul>
<li>GPT-1于2018年6月发布，是一个只有解码器架构的生成预训练语言模型。</li>
<li>GPT-2于2019年2月发布，是一个无监督多任务学习器模型。拥有4个模型版本：分别是Small(117M)，Medium(345M)，Large(762M)，Extra Large(1542M)</li>
<li>GPT-3于2020年5月发布，是一个拥有上下文学习能力的大语言模型。拥有8个模型版本，最小的参数规模为125M，最大的参数规模为175B</li>
<li>Codex于2021年7月发布，它在GPT-3的基础上增加了代码生成的能力，并在代码数据集上进行了预训练。</li>
<li>GPT-3.5于2022年3月发布，是一个综合的大语言模型，拥有对话生成，文本阅读和理解等功能。参数规模为175B。</li>
<li>GPT-4于2023年3月发布，是一个多模态大模型，拥有较强的综合推理能力。</li>
</ul>
</blockquote>
<h4 id="1-6-3-LLaMA-系列模型的发展"><a href="#1-6-3-LLaMA-系列模型的发展" class="headerlink" title="1.6.3 LLaMA 系列模型的发展"></a>1.6.3 LLaMA 系列模型的发展</h4><h5 id="两张可爱的关于-LLaMA-的漫画："><a href="#两张可爱的关于-LLaMA-的漫画：" class="headerlink" title="两张可爱的关于 LLaMA 的漫画："></a><strong>两张可爱的关于 LLaMA 的漫画：</strong></h5><div align='center' style='display: flex; justify-content: center; align-items: center;'><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/a-ai-art-of-llama-family-v0-rpr9pe916xvb1.png" alt="a-ai-art-of-llama-family-v0-rpr9pe916xvb1" style="zoom:45%; margin-right: 2%" /><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/es99xs58xxvb1.jpeg" alt="CDN media" style="zoom:45%; margin-left: 2%" /></div>

<h5 id="LLaMA-的发展和演变："><a href="#LLaMA-的发展和演变：" class="headerlink" title="LLaMA 的发展和演变："></a><strong>LLaMA 的发展和演变：</strong></h5><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/LLaMA.png" alt="LLaMA" style="zoom:100%;" />

<blockquote>
<p>图中展示的是<code>LLaMA</code>的演变过程。</p>
<ul>
<li>虚线部分代表使用其他数据<code>继续预训练</code>，实线部分代表<code>指令微调</code></li>
<li>图中没有背景颜色的表示 <code>参数高效微调</code>，黄色背景颜色的表示 <code>全参数微调</code></li>
<li>从图中可以看出，<code>LLaMA</code> 有三个主要的分支，分别是：<ol>
<li>使用中文数据预训练后的 <code>Chinese LLaMA</code></li>
<li>使用合成数据进行微调的 <code>Alpaca</code></li>
<li>使用对话数据进行微调的 <code>Vicuna</code></li>
</ol>
</li>
<li>在这三个主要的分支的基础上，又发展出了许多其他的模型，涵盖了数学，金融，医疗，法律，双语，教育等方面。其中有很多我们耳熟能详的模型，例如：<code>MiniGPT-4</code>、<code>PandaGPT</code>、<code>TaoLi</code>等</li>
</ul>
</blockquote>
<h2 id="2-大语言模型四个主要方面"><a href="#2-大语言模型四个主要方面" class="headerlink" title="2 大语言模型四个主要方面"></a>2 大语言模型四个主要方面</h2><h3 id="2-1-预训练"><a href="#2-1-预训练" class="headerlink" title="2.1 预训练"></a>2.1 预训练</h3><h4 id="2-1-1-为什么要进行预训练？"><a href="#2-1-1-为什么要进行预训练？" class="headerlink" title="2.1.1 为什么要进行预训练？"></a>2.1.1 为什么要进行预训练？</h4><blockquote>
<p>预训练奠定了大语言模型(LLM)能力的基础。通过对大规模语料库的预训练，大语言模型(LLM)可以获得基本的语言理解和生成技能。预训练语料库的规模和质量是大语言模型(LLM)获得强大功能的关键。此外，为了有效地预训练大语言模型(LLM)，模型架构，加速方法和优化技术需要很好地设计。</p>
</blockquote>
<h4 id="2-1-2-预训练包括哪些过程？"><a href="#2-1-2-预训练包括哪些过程？" class="headerlink" title="2.1.2 预训练包括哪些过程？"></a>2.1.2 预训练包括哪些过程？</h4><blockquote>
<p>预训练通常包括：数据的收集和处理，模型的结构设计和模型训练的优化技术三个部分。不同的模型预训练可能存在一些特殊的情况，但是普遍的情况都会包含上述的三个部分。</p>
</blockquote>
<h5 id="数据预处理"><a href="#数据预处理" class="headerlink" title="数据预处理"></a><strong>数据预处理</strong></h5><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/image-20231024142708636.png" alt="image-20231024142708636" style="zoom:100%;" />

<blockquote>
<p>上图中显示的是数据预处理的过程：</p>
<ol>
<li>原始语料(Raw Corpus)输入到质量过滤器(Quality Filtering)进行一些过滤操作，例如：语言过滤，度量过滤，统计过滤，关键词过滤等。</li>
<li>过滤后的数据输入到重复过滤器中，去除句子，文档，集合中的重复部分。</li>
<li>去重之后的数据输入到隐私过滤器中，检测并去除一些包含个人隐私的信息。</li>
<li>去除隐私后的数据输入到分词器中，得到词汇表和数据对应的向量表示。</li>
</ol>
</blockquote>
<h5 id="模型训练时各种数据来源的占比"><a href="#模型训练时各种数据来源的占比" class="headerlink" title="模型训练时各种数据来源的占比"></a><strong>模型训练时各种数据来源的占比</strong></h5><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E9%A2%84%E8%AE%AD%E7%BB%83%E8%BF%99%E7%A7%8D%E5%90%84%E7%A7%8D%E6%95%B0%E6%8D%AE%E7%9A%84%E5%8D%A0%E6%AF%94.png" alt="大语言模型预训练这种各种数据的占比" style="zoom:100%;" />

<blockquote>
<p>上图中显示的是在预训练过程中各种数据来源的占比。</p>
<ul>
<li>图中可以看出，大部分大语言模型(LLM)都或多或少的使用了网页，书籍和新闻等数据</li>
<li>一些特殊的大语言模型(LLM)可能有针对性的使用了一些数据集进行训练或者微调</li>
</ul>
</blockquote>
<h5 id="主流的大语言模型架构"><a href="#主流的大语言模型架构" class="headerlink" title="主流的大语言模型架构"></a><strong>主流的大语言模型架构</strong></h5><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/image-20231024144043317.png" alt="image-20231024144043317" style="zoom:100%;" />

<blockquote>
<p>主流的大语言模型的架构如上图所示，分别是：因果解码器架构，前缀解码器架构和编码器-解码器架构。</p>
<p>图中蓝色的方块表示前缀token之间的注意力，绿色的方块表示前缀token和目标token之间的注意力，黄色的方块表示目标token之间的注意力，灰色的方块表示掩码注意力。</p>
</blockquote>
<ol>
<li><strong>Encoder-Decoder架构</strong>：  普通的 Transformer 模型建立在<code>Encoder-Decoder</code>架构之上，它由两个 Transformer 块组成，分别作为编码器和解码器。编码器采用堆叠的多头自注意层对输入序列进行编码以生成其潜在表示，而解码器对这些表示进行交叉注意并自回归生成目标序列。<code>Encoder-Decoder</code>架构的预训练语言模型（例如 T5  和 BART）已在各种 NLP 任务中显示出有效性。到目前为止，只有少量的大语言模型是基于<code>Encoder-Decoder</code>架构构建的，例如 Flan-T5。</li>
<li><strong>Causal Decoder架构</strong>：<code>Causal Decoder</code>架构结合了单向注意掩码，以保证每个输入<code>token</code>只能关注过去的<code>token</code>和它自己。输入和输出<code>token</code>通过解码器以相同的方式处理。作为该架构的代表性语言模型，GPT 系列模型是基于因果解码器架构开发的。特别是，GPT-3 已经成功地证明了这种架构的有效性，也展示了大语言模型惊人的上下文学习能力。有趣的是，GPT-1 和 GPT-2 并没有表现出像 GPT-3 那样优越的能力，而且似乎缩放在增加该模型架构的模型容量方面起着重要作用。到目前为止，Causal Decoder已被各种现有的大语言模型广泛采用作为 大语言模型的体系结构，例如 OPT、BLOOM和 Gopher。请注意，接下来讨论的因果解码器和前缀解码器都属于<code>Decoder-only architecture</code>架构。而在提到<code>Decoder-only architecture</code>时，除非特别说明，否则主要指的是现有文献中的<code>Causal decoder architecture</code>。</li>
<li><strong>Prefix Decoder架构</strong>（又名，非因果解码器）修改了<code>Causal Decoder</code>的掩码机制，以实现对前缀<code>token</code>的双向关注和仅对生成的<code>token</code>的单向关注。这样，与<code>Encoder-Decoder</code>架构一样，前缀解码器可以对前缀序列进行双向编码，并自回归地逐个预测输出<code>token</code>，其中在编码和解码期间共享相同的参数。与其从头开始预训练，一个实用的建议是不断训练因果解码器，然后将它们转换为前缀解码器以加速收敛，例如，U-PaLM 源自 PaLM。现有的基于前缀解码器的代表性 LLM 包括 ChatGLM和 U-PaLM。</li>
</ol>
<div align='center' style='display: flex; justify-content: center; align-items: center;'><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/v2-0af88beb0cc280317e06b24c2582eb60_720w.webp" alt="img" style="zoom:50%; margin-right: 2%" /><img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/eac4b74543a982263a2cf99edf6977044890eb88.jpeg@f_auto" alt="img" style="zoom:50%; margin-left: 2%" /></div>

<h3 id="2-2-适配微调"><a href="#2-2-适配微调" class="headerlink" title="2.2 适配微调"></a>2.2 适配微调</h3><blockquote>
<p>大语言模型(LLM)可以获得解决各种任务的通用能力，但是研究表明，可以通过进一步的适配使得大模型能够更好的解决特定的问题。</p>
<p>综述中主要谈及了两种常见的大语言模型的微调方法，分别是指令微调(instruction tuning)和对齐微调(alignment tuning)</p>
</blockquote>
<h4 id="2-2-1-指令微调-instruction-tuning"><a href="#2-2-1-指令微调-instruction-tuning" class="headerlink" title="2.2.1 指令微调(instruction tuning)"></a>2.2.1 指令微调(instruction tuning)</h4><blockquote>
<p>指令微调是在自然语言格式的实例集合上微调预训练后的LLM的方法。这种方法与有监督微调和多任务提示训练密切相关，旨在增强（或解锁）LLM的能力。</p>
</blockquote>
<h4 id="2-2-2-对齐微调-alignment-tuning"><a href="#2-2-2-对齐微调-alignment-tuning" class="headerlink" title="2.2.2 对齐微调(alignment tuning)"></a>2.2.2 对齐微调(alignment tuning)</h4><blockquote>
<p>对齐微调旨在将LLM的行为与人类的价值观或偏好对齐。减少大语言模型生成一些有毒，有攻击性，有种族或者性别歧视等多种有悖于人类价值观的结果。</p>
</blockquote>
<h3 id="2-3-使用"><a href="#2-3-使用" class="headerlink" title="2.3 使用"></a>2.3 使用</h3><blockquote>
<p>经过预训练或适配微调之后，使用LLM的主要方法是为解决各种任务设计适当的提示策略。本综述介绍了两种典型的提示策略，分别是：上下文学习(in-context learning, ICL)和思维链提示(chain-of-thought prompting)。</p>
</blockquote>
<h4 id="2-3-1-上下文学习-in-context-learning-ICL"><a href="#2-3-1-上下文学习-in-context-learning-ICL" class="headerlink" title="2.3.1 上下文学习(in-context learning, ICL)"></a>2.3.1 上下文学习(in-context learning, ICL)</h4><blockquote>
<p>上下文学习(ICL)是一种典型的提示方法，它将任务描述以自然语言文本的形式表达。上下文学习(ICL)使用一种由任务描述和作为示范的几个任务样例构成的自然语言提示。</p>
<p><strong>我的理解：</strong>简单来说，上下文学习(ICL)就是给一段上下文，然后给一个问题，让大模型回答这个问题。没有中间的推理过程，只是回答给出的问题。</p>
</blockquote>
<h4 id="2-3-2-思维链提示-chain-of-thought-prompting"><a href="#2-3-2-思维链提示-chain-of-thought-prompting" class="headerlink" title="2.3.2 思维链提示(chain-of-thought prompting)"></a>2.3.2 思维链提示(chain-of-thought prompting)</h4><blockquote>
<p>思维链提示(chain-of-thought prompting)可以通过将一系列中间推理步骤加入提示中来增强上下文学习(ICL)。</p>
<p>思维链(CoT)是一种改进的提示策略，旨在提高大语言模型(LLM)在复杂推理任务中的性能，例如算术推理，常识推理和符号推理。</p>
<p>不同于上下文学习(ICL)中仅使用输入输出对来构造提示，思维链(CoT)将可以导出最终输出的中间推理步骤纳入提示中。</p>
<p><strong>我的理解：</strong>思维链提示(chain-of-thought prompting)是一种特殊的上下文学习(ICL)。思维链将中间的推理过程也用于提示，可以提高大语言模型在复杂推理任务中的性能。</p>
</blockquote>
<img src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/%E4%B8%8A%E4%B8%8B%E6%96%87%E5%AD%A6%E4%B9%A0%E5%92%8C%E6%80%9D%E7%BB%B4%E9%93%BE%E6%8F%90%E7%A4%BA.png" alt="上下文学习和思维链提示" style="zoom:100%;" />

<blockquote>
<p>上下文学习（ICL）和思维链（CoT）提示的比较说明。ICL用自然语言描述、几个演示和一个测试查询来提示LLM，而CoT提示涉及提示中的一系列中间推理步骤。</p>
<ol>
<li>绿色部分表示任务的描述</li>
<li>浅蓝色部分表示问题的示例</li>
<li>蓝色部分表示中间推理的步骤(思维链)</li>
<li>黄色部分表示需要回答的问题</li>
</ol>
</blockquote>
<h3 id="2-4-能力评估"><a href="#2-4-能力评估" class="headerlink" title="2.4 能力评估"></a>2.4 能力评估</h3><blockquote>
<p>为了检验LLM的有效性和优越性，已有研究采用了大量的任务和基准数据集来进行实证评估和分析。综述中介绍了大语言模型(LLM)在<code>语言生成</code>、<code>知识利用</code>和<code>复杂推理</code>三个方面的基本评估任务。</p>
</blockquote>
<h4 id="2-4-1-语言生成"><a href="#2-4-1-语言生成" class="headerlink" title="2.4.1 语言生成"></a>2.4.1 语言生成</h4><blockquote>
<p>现有语言生成的任务主要可以分为语言建模、条件文本生成和代码合成任务。需要注意的是，代码合成不是典型的自然语言处理任务，但可以直接地用（经过代码数据训练的）LLM以类似自然语言文本生成的方法解决。</p>
<ol>
<li>语言建模：语言建模是大语言模型(LLM)的基本能力，旨在基于前面的 token 预测下一个 token，主要关注基本的语言理解和生成能力。</li>
<li>条件文本生成：条件文本生成旨在基于给定的条件生成满足特定任务需求的文本， 通常包括机器翻译、文本摘要和问答系统等。</li>
<li>代码合成：除了生成高质量的自然语言外，现有的大语言模型(LLM)还表现出强大的生成形式语言的能力，尤其是满足特定条件的计算机程序，这种能力被称为代码合成。</li>
</ol>
</blockquote>
<h4 id="2-4-2-知识利用"><a href="#2-4-2-知识利用" class="headerlink" title="2.4.2 知识利用"></a>2.4.2 知识利用</h4><blockquote>
<p>知识利用是一种智能系统基于事实证据的支撑，完成知识密集型任务的重要能力（例如常识问题回答和事实补全）。具体而言，它要求大语言模型(LLM)适当地利用来自预训练语料库的丰富事实知识，或在必要的时候检索外部数据。现有的知识利用任务分为三种类型，即闭卷问答，开卷问答和知识补全。</p>
<ol>
<li>闭卷问答：闭卷问答任务测试大语言模型(LLM)从预训练语料库中习得的事实知识。大语言模型(LLM)只能基于给定的上下文回答问题， 而不能使用外部资源。</li>
<li>开卷问答：与闭卷问答不同，在开卷问答任务中，大语言模型(LLM)可以从外部知识库或文档集合中提取有用的证据，然后基于提取的证据回答问题。</li>
<li>知识补全：在知识补全任务中，大语言模型(LLM)可以被视为一个知识库，补全或预测知识单元的缺失部分。这种任务可以探索和评估大语言模型(LLM)从预训练数据中学习到的知识的数量和种类。</li>
</ol>
</blockquote>
<h4 id="2-4-3-复杂推理"><a href="#2-4-3-复杂推理" class="headerlink" title="2.4.3 复杂推理"></a>2.4.3 复杂推理</h4><blockquote>
<p>复杂推理是指理解和利用相关的证据或逻辑来推导结论或做出决策的能力。根据推理过程中涉及的逻辑和证据类型，我们考虑将现有的评估任务分为三个主要类别，即知识推理、符号推理和数学推理。</p>
<ol>
<li>知识推理：知识推理任务依赖于逻辑关系和事实知识的证据来回答给定的问题。</li>
<li>符号推理：符号推理任务主要关注于在形式化规则设定中操作符号以实现某些特定目标，且这些操作和规则可能在大语言模型(LLM)预训练期间从未被看到过。</li>
<li>数学推理：数学推理任务需要综合利用数学知识、逻辑和计算来解决问题或生成证明过程。</li>
</ol>
</blockquote>
<p>大语言模型(LLM)的基础评测任务和相应的代表性数据集</p>
<table>
  <tr>
    <th>Level</th>
    <th>Ability</th>
    <th>Task</th>
    <th>Dataset</th>
  </tr>
  <tr>
    <td rowspan="9">Basic</td>
    <td rowspan="3">Language Generation</td>
    <td>Language Modeling</td>
    <td>Penn Treebank, WikiText-103, the Pile, LAMBADA</td>
  </tr>
  <tr>
    <td>Conditional Text Generation</td>
    <td>
      WMT’14,16,19,20,21,22, Flores-101, DiaBLa, CNN/DailyMail, XSum, WikiLingua,
      OpenDialKG
    </td>
  </tr>
  <tr>
    <td>Code Synthesis</td>
    <td>APPS, HumanEval, MBPP, CodeContest, MTPB, DS-1000, ODEX</td>
  </tr>
  <tr>
    <td rowspan="3">Knowledge Utilization</td>
    <td>Closed-Book QA</td>
    <td>
      Natural Questions, ARC, TruthfulQA, Web Questions, TriviaQA, PIQA,
      LC-quad2.0, GrailQA, KQApro, CWQ, MKQA, ScienceQA
    </td>
  </tr>
  <tr>
    <td>Open-Book QA</td>
    <td>
      Natural Questions, OpenBookQA, ARC, TriviaQA, Web Questions, MS MARCO,
      QASC, SQuAD, WikiMovies
    </td>
  </tr>
  <tr>
    <td>Knowledge Completion</td>
    <td>WikiFact, FB15k-237, Freebase, WN18RR, WordNet, LAMA, YAGO3-10, YAGO</td>
  </tr>
  <tr>
    <td rowspan="3">Complex Reasoning</td>
    <td>Knowledge Reasoning</td>
    <td>
      CSQA, StrategyQA, HotpotQA, ARC, BoolQ, PIQA, SIQA, HellaSwag, WinoGrande,
      COPA, OpenBookQA, ScienceQA, proScript, ProPara, ExplaGraphs, ProofWriter,
      EntailmentBank, ProOntoQA
    </td>
  </tr>
  <tr>
    <td>Symbolic Reasoning</td>
    <td>
      CoinFlip, ReverseList, LastLetter, Boolean Assignment, Parity, Colored
      Object, Penguins in a Table, Repeat Copy, Object Counting
    </td>
  </tr>
  <tr>
    <td>Mathematical Reasoning</td>
    <td>
      MATH, GSM8k, SVAMP, MultiArith, ASDiv, MathQA, AQUA-RAT, MAWPS, DROP,
      NaturalProofs, PISA, miniF2F, ProofNet
    </td>
  </tr>
  <tr>
    <td rowspan="11">Advanced</td>
    <td rowspan="3">Human Alignment</td>
    <td>Honestness</td>
    <td>TruthfulQA, HaluEval</td>
  </tr>
  <tr>
    <td>Helpfulness</td>
    <td>HH-RLHF</td>
  </tr>
  <tr>
    <td>Harmlessness</td>
    <td>HH-RLHF, Crows-Pairs, WinoGender, RealToxicityPrompts</td>
  </tr>
  <tr>
    <td rowspan="3">Interaction with External Environment</td>
    <td>Household</td>
    <td>VirtualHome, BEHAVIOR, ALFRED, ALFWorld</td>
  </tr>
  <tr>
    <td>Website Environment</td>
    <td>WebShop, Mind2Web</td>
  </tr>
  <tr>
    <td>Open World</td>
    <td>MineRL, MineDojo</td>
  </tr>
  <tr>
    <td rowspan="5">Tool Manipulation</td>
    <td>Search Engine</td>
    <td>HotpotQA, TriviaQA, Natural Questions</td>
  </tr>
  <tr>
    <td>Code Executor</td>
    <td>GSM8k, TabMWP, Date Understanding</td>
  </tr>
  <tr>
    <td>Calculator</td>
    <td>GSM8k, MATH, CARP</td>
  </tr>
  <tr>
    <td>Model Interface</td>
    <td>GPT4Tools, Gorilla</td>
  </tr>
  <tr>
    <td>Data Interface</td>
    <td>WebQSP, MetaQA, WTQ, WikiSQL, TabFact, Spider</td>
  </tr>
</table>


<h2 id="3-总结与未来方向"><a href="#3-总结与未来方向" class="headerlink" title="3 总结与未来方向"></a>3 总结与未来方向</h2><h3 id="3-1-总结"><a href="#3-1-总结" class="headerlink" title="3.1 总结"></a>3.1 总结</h3><p>本文的主要贡献：</p>
<ol>
<li>本文综述了大语言模型(LLM)的最新进展，并介绍了大语言模型(LLM)的主要概念、研究成果以及理解和利用大语言模型(LLM)的技术。</li>
<li>本文主要介绍了大小超过10B的大语言模型，没有考虑早期的预训练语言模型，例如：BERT、GPT-2等。</li>
<li>本文讨论了大语言模型(LLM)的四个重要方面，分别是：预训练、适配微调、使用和能力评估。</li>
<li>本文总结了开发大语言模型(LLM)的可用资源，并讨论了实现大语言模型(LLM)的重要技术以便复现大语言模型(LLM)。</li>
<li>本文的目标是涵盖关于大语言模型(LLM)的最新文献，并为研究人员和工程师提供一份有关这个主题的优质参考资料。</li>
</ol>
<h3 id="3-2-未来方向"><a href="#3-2-未来方向" class="headerlink" title="3.2 未来方向"></a>3.2 未来方向</h3><p>本文最后在以下几个方面，介绍了大语言模型(LLM)的挑战和未来方向：</p>
<ol>
<li>理论和原理：<ul>
<li>挑战：<ul>
<li>大语言模型(LLM)的运行机制目前还不是非常的明朗，例如：大模型的涌现能力出现的原因。</li>
<li>大语言模型(LLM)如何通过非常大且深的神经网络分配、组织和利用信息。</li>
<li>理解、描述和解释大语言模型(LLM)的能力或行为的正式理论和原理仍然缺失。</li>
</ul>
</li>
<li>未来方向：<ul>
<li>对于大语言模型(LLM)的涌现能力的解释和研究。</li>
<li>对大语言模型(LLM)利用、分配和组织信息的方式进行研究。</li>
<li>建立和完善理解、描述和解释大语言模型(LLM)的能力或行为的理论和原理。</li>
</ul>
</li>
</ul>
</li>
<li>模型架构：<ul>
<li>挑战：<ul>
<li>减少标准自注意力机制所带来的时间复杂度是一个实际应用时重要的考虑因素。</li>
<li><code>灾难性遗忘</code>一直是神经网络的长期挑战，其对大语言模型(LLM)也有负面影响。</li>
</ul>
</li>
<li>未来方向：<ul>
<li>研究如何构建大语言模型(LLM)中更高效的<code>Transformer</code>变体十分重要，例如 GPT-3 中已经使用了<code>稀疏注意力</code>。</li>
<li>考虑将现有架构扩展到更具灵活性的机制或模块，以有效支持数据更新和任务专用化。</li>
</ul>
</li>
</ul>
</li>
<li>模型训练：<ul>
<li>挑战：<ul>
<li>预训练强大的大语言模型(LLM)需要消耗巨大的算力，并且对<code>数据质量</code>和<code>训练技巧</code>要求很高。</li>
</ul>
</li>
<li>未来方向：<ul>
<li>开发更系统、经济的预训练方法以优化大语言模型(LLM)变得尤为重要，同时考虑到模型有效性、效率优化和训练稳定性等因素。</li>
</ul>
</li>
</ul>
</li>
<li>模型应用：<ul>
<li>挑战：<ul>
<li>由于在实际应用中微调的成本非常高，提示已成为使用大语言模型(LLM)的主要方法，但是提示设计时需要大量人力。</li>
<li>一些复杂任务（例如形式证明和数值计算）需要特定的知识或逻辑规则，这些规则可能无法用自然语言很好地表达或通过示例演示。</li>
</ul>
</li>
<li>未来方向：<ul>
<li>研究如何自动生成有用且高效的提示以解决各种任务。</li>
<li>开发更具信息量和灵活性的任务格式化方法以进行提示非常重要。</li>
</ul>
</li>
</ul>
</li>
<li>安全与对齐：<ul>
<li>挑战：<ul>
<li>大语言模型(LLM)倾向于产生幻觉， 这些文本看似合理，但可能在事实上是错误的。例如：ChatGPT 刚发布的时候存在&quot;一本正经的胡说八道&quot;的情况。</li>
<li>现有的避免大语言模型(LLM)产生幻觉或者生成有毒、有害、有偏见文本的方法，主要是将人工纳入训练循环来开发良好对齐的大语言模型(LLM)，并使用人类反馈强化学习(RLHF)。但是这严重依赖专业标注者的高质量人类反馈数据，这使得它在实践中难以适当实施。</li>
</ul>
</li>
<li>未来方向：<ul>
<li>研究如何避免大语言模型(LLM)产生幻觉或者生成一些有毒，有害，有偏见的文本。</li>
<li>有必要改进人类反馈强化学习(RLHF)框架以减少人类标注者的工作量，并寻求更高效的、具有保证数据质量的标注方法，例如LLM可以用于辅助标注工作。</li>
</ul>
</li>
</ul>
</li>
</ol>
<h2 id="4-关于大语言模型的最新文献综述"><a href="#4-关于大语言模型的最新文献综述" class="headerlink" title="4 关于大语言模型的最新文献综述"></a>4 关于大语言模型的最新文献综述</h2><h3 id="4-1-大语言模型列表："><a href="#4-1-大语言模型列表：" class="headerlink" title="4.1 大语言模型列表："></a>4.1 大语言模型列表：</h3><table class="tg">
<thead>
  <tr>
    <th class="tg-nrix" align="center" rowspan="2">分类</th>
    <th class="tg-baqh" align="center" rowspan="2">模型</th>
    <th class="tg-0lax" align="center" rowspan="2">发表时间</th>
    <th class="tg-baqh" align="center" rowspan="2">大小(B)</th>
    <th class="tg-0lax" align="center" rowspan="2">链接</th>
  </tr>
  <tr>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-nrix" align="center" rowspan="25">开源<br/>大模型</td>
    <td class="tg-baqh" align="center">T5</td>
    <td class="tg-0lax" align="center">2019/10</td>
    <td class="tg-baqh" align="center">11</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/1910.10683">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">mT5</td>
    <td class="tg-0lax" align="center">2021/03</td>
    <td class="tg-baqh" align="center">13</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2010.11934">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">PanGu-α</td>
    <td class="tg-0lax" align="center">2021/05</td>
    <td class="tg-baqh" align="center">13</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2104.12369">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">CPM-2</td>
    <td class="tg-0lax" align="center">2021/05</td>
    <td class="tg-baqh" align="center">198</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2106.10715">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">T0</td>
    <td class="tg-0lax" align="center">2021/10</td>
    <td class="tg-baqh" align="center">11</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2110.08207">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">GPT-NeoX-20B</td>
    <td class="tg-0lax" align="center">2022/02</td>
    <td class="tg-baqh" align="center">20</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2204.06745">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">CodeGen</td>
    <td class="tg-0lax" align="center">2022/03</td>
    <td class="tg-baqh" align="center">16</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2203.13474">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Tk-Instruct</td>
    <td class="tg-0lax" align="center">2022/04</td>
    <td class="tg-baqh" align="center">11</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2204.07705">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">UL2</td>
    <td class="tg-0lax" align="center">2022/02</td>
    <td class="tg-baqh" align="center">20</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2205.05131">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">OPT</td>
    <td class="tg-0lax" align="center">2022/05</td>
    <td class="tg-baqh" align="center">175</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2205.01068">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">YaLM</td>
    <td class="tg-0lax" align="center">2022/06</td>
    <td class="tg-baqh" align="center">100</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/yandex/YaLM-100B">GitHub</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">NLLB</td>
    <td class="tg-0lax" align="center">2022/07</td>
    <td class="tg-baqh" align="center">55</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2207.04672">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">BLOOM</td>
    <td class="tg-0lax" align="center">2022/07</td>
    <td class="tg-baqh" align="center">176</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2211.05100">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">GLM</td>
    <td class="tg-0lax" align="center">2022/08</td>
    <td class="tg-baqh" align="center">130</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2210.02414">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Flan-T5</td>
    <td class="tg-0lax" align="center">2022/10</td>
    <td class="tg-baqh" align="center">11</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2210.11416">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">mT0</td>
    <td class="tg-0lax" align="center">2022/11</td>
    <td class="tg-baqh" align="center">13</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2211.01786">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Galactica</td>
    <td class="tg-0lax" align="center">2022/11</td>
    <td class="tg-baqh" align="center">120</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2211.09085">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">BLOOMZ</td>
    <td class="tg-0lax" align="center">2022/11</td>
    <td class="tg-baqh" align="center">176</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2211.01786">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">OPT-IML</td>
    <td class="tg-0lax" align="center">2022/12</td>
    <td class="tg-baqh" align="center">175</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2212.12017">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Pythia</td>
    <td class="tg-0lax" align="center">2023/01</td>
    <td class="tg-baqh" align="center">12</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2304.01373">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">LLaMA</td>
    <td class="tg-0lax" align="center">2023/02</td>
    <td class="tg-baqh" align="center">65</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2302.13971v1">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Vicuna</td>
    <td class="tg-0lax" align="center">2023/03</td>
    <td class="tg-baqh" align="center">13</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://lmsys.org/blog/2023-03-30-vicuna/">Blog</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">ChatGLM</td>
    <td class="tg-0lax" align="center">2023/03</td>
    <td class="tg-baqh" align="center">6</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/THUDM/ChatGLM-6B">GitHub</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">CodeGeeX</td>
    <td class="tg-0lax" align="center">2023/03</td>
    <td class="tg-baqh" align="center">13</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2303.17568">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Koala</td>
    <td class="tg-0lax" align="center">2023/04</td>
    <td class="tg-baqh" align="center">13</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://bair.berkeley.edu/blog/2023/04/03/koala/">Blog</a></td>
  </tr>
  <tr>
    <td class="tg-nrix" align="center" rowspan="30">不开源<br />大模型</td>
    <td class="tg-baqh" align="center">GShard</td>
    <td class="tg-0lax" align="center">2020/01</td>
    <td class="tg-baqh" align="center">600</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2006.16668v1">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">GPT-3</td>
    <td class="tg-0lax" align="center">2020/05</td>
    <td class="tg-baqh" align="center">175</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2005.14165">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">LaMDA</td>
    <td class="tg-0lax" align="center">2021/05</td>
    <td class="tg-baqh" align="center">137</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2201.08239">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">HyperCLOVA</td>
    <td class="tg-0lax" align="center">2021/06</td>
    <td class="tg-baqh" align="center">82</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2109.04650">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Codex</td>
    <td class="tg-0lax" align="center">2021/07</td>
    <td class="tg-baqh" align="center">12</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2107.03374">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">ERNIE 3.0</td>
    <td class="tg-0lax" align="center">2021/07</td>
    <td class="tg-baqh" align="center">10</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2107.02137">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Jurassic-1</td>
    <td class="tg-0lax" align="center">2021/08</td>
    <td class="tg-baqh" align="center">178</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://assets.website-files.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">FLAN</td>
    <td class="tg-0lax" align="center">2021/10</td>
    <td class="tg-baqh" align="center">137</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2109.01652">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">MT-NLG</td>
    <td class="tg-0lax" align="center">2021/10</td>
    <td class="tg-baqh" align="center">530</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2201.11990">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Yuan 1.0</td>
    <td class="tg-0lax" align="center">2021/10</td>
    <td class="tg-baqh" align="center">245</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2110.04725">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Anthropic</td>
    <td class="tg-0lax" align="center">2021/12</td>
    <td class="tg-baqh" align="center">52</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2112.00861">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">WebGPT</td>
    <td class="tg-0lax" align="center">2021/12</td>
    <td class="tg-baqh" align="center">175</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2112.09332">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Gopher</td>
    <td class="tg-0lax" align="center">2021/12</td>
    <td class="tg-baqh" align="center">280</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2112.11446v2">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">ERNIE 3.0 Titan</td>
    <td class="tg-0lax" align="center">2021/12</td>
    <td class="tg-baqh" align="center">260</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2112.12731">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">GLaM</td>
    <td class="tg-0lax" align="center">2021/12</td>
    <td class="tg-baqh" align="center">1200</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2112.06905">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">InstructGPT</td>
    <td class="tg-0lax" align="center">2022/01</td>
    <td class="tg-baqh" align="center">175</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2203.02155v1">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">AlphaCode</td>
    <td class="tg-0lax" align="center">2022/02</td>
    <td class="tg-baqh" align="center">41</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2203.07814v1">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Chinchilla</td>
    <td class="tg-0lax" align="center">2022/03</td>
    <td class="tg-baqh" align="center">70</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2203.15556">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">PaLM</td>
    <td class="tg-0lax" align="center">2022/04</td>
    <td class="tg-baqh" align="center">540</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2204.02311">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Cohere</td>
    <td class="tg-0lax" align="center">2022/06</td>
    <td class="tg-baqh" align="center">54</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://cohere.ai/">Homepage</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">AlexaTM</td>
    <td class="tg-0lax" align="center">2022/08</td>
    <td class="tg-baqh" align="center">20</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2208.01448">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Luminous</td>
    <td class="tg-0lax" align="center">2022/09</td>
    <td class="tg-baqh" align="center">70</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://docs.aleph-alpha.com/docs/introduction/luminous/">Docs</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Sparrow</td>
    <td class="tg-0lax" align="center">2022/09</td>
    <td class="tg-baqh" align="center">70</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2209.14375v1">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">WeLM</td>
    <td class="tg-0lax" align="center">2022/09</td>
    <td class="tg-baqh" align="center">10</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2209.10372">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">U-PaLM</td>
    <td class="tg-0lax" align="center">2022/10</td>
    <td class="tg-baqh" align="center">540</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2210.11399">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Flan-PaLM</td>
    <td class="tg-0lax" align="center">2022/10</td>
    <td class="tg-baqh" align="center">540</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2210.11416">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Flan-U-PaLM</td>
    <td class="tg-0lax" align="center">2022/10</td>
    <td class="tg-baqh" align="center">540</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2210.11416">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">Alpaca</td>
    <td class="tg-0lax" align="center">2023/03</td>
    <td class="tg-baqh" align="center">7</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://crfm.stanford.edu/2023/03/13/alpaca.html">Blog</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">GPT-4</td>
    <td class="tg-0lax" align="center">2023/03</td>
    <td class="tg-baqh" align="center">-</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2303.08774v2">论文链接</a></td>
  </tr>
  <tr>
    <td class="tg-baqh" align="center">PanGu-Σ</td>
    <td class="tg-0lax" align="center">2023/03</td>
    <td class="tg-baqh" align="center">1085</td>
    <td class="tg-0lax" align="center"><a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2303.10845">论文链接</a></td>
  </tr>
</tbody>
</table>


<h3 id="4-2-大模型相关论文"><a href="#4-2-大模型相关论文" class="headerlink" title="4.2 大模型相关论文"></a>4.2 大模型相关论文</h3><h4 id="4-2-1-开源大模型论文："><a href="#4-2-1-开源大模型论文：" class="headerlink" title="4.2.1 开源大模型论文："></a>4.2.1 开源大模型论文：</h4><ol>
<li><u>T5</u>: <strong>&quot;Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer&quot;</strong>. <em>Colin Raffel et al.</em> JMLR 2019. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/1910.10683">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/t5-11b">Checkpoint</a>]</li>
<li><u>mT5</u>: <strong>&quot;mT5: A massively multilingual pre-trained text-to-text transformer&quot;</strong>. <em>Linting Xue et al.</em> NAACL 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2010.11934">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/google/mt5-xxl/tree/main">Checkpoint</a>]</li>
<li><u>PanGu-α</u>: <strong>&quot;PanGu-α: Large-scale Autoregressive Pretrained Chinese Language Models with Auto-parallel Computation&quot;</strong>. <em>Wei Zeng et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2104.12369">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://openi.pcl.ac.cn/PCL-Platform.Intelligence/PanGu-Alpha">Checkpoint</a>]</li>
<li><u>CPM-2</u>: <strong>&quot;CPM-2: Large-scale Cost-effective Pre-trained Language Models&quot;</strong>. <em>Zhengyan Zhang et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2106.10715">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/TsinghuaAI/CPM">Checkpoint</a>]</li>
<li><u>T0</u>: <strong>&quot;Multitask Prompted Training Enables Zero-Shot Task Generalization&quot;</strong>. <em>Victor Sanh et al.</em> ICLR 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2110.08207">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/bigscience/T0">Checkpoint</a>]</li>
<li><u>GPT-NeoX-20B</u>: <strong>&quot;GPT-NeoX-20B: An Open-Source Autoregressive Language Model&quot;</strong>. <em>Sid Black et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2204.06745">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/EleutherAI/gpt-neox-20b/tree/main">Checkpoint</a>]</li>
<li><u>CodeGen</u>: <strong>&quot;CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis&quot;</strong>. <em>Erik Nijkamp et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2203.13474">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/Salesforce/codegen-16B-nl">Checkpoint</a>]</li>
<li><u>Tk-Instruct</u>: <strong>&quot;Super-NaturalInstructions: Generalization via Declarative Instructions on 1600+ NLP Tasks&quot;</strong>. <em>Yizhong Wang et al.</em> EMNLP 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2204.07705">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/allenai/tk-instruct-11b-def-pos">Checkpoint</a>]</li>
<li><u>UL2</u>: <strong>&quot;UL2: Unifying Language Learning Paradigms&quot;</strong>. <em>Yi Tay et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2205.05131">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/google-research/google-research/tree/master/ul2">Checkpoint</a>]</li>
<li><u>OPT</u>: <strong>&quot;OPT: Open Pre-trained Transformer Language Models&quot;</strong>. <em>Susan Zhang et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2205.01068">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/facebookresearch/metaseq/tree/main/projects/OPT">Checkpoint</a>]</li>
<li><u>NLLB</u>: <strong>&quot;No Language Left Behind: Scaling Human-Centered Machine Translation&quot;</strong>. <em>NLLB Team.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2207.04672">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/facebookresearch/fairseq/tree/nllb">Checkpoint</a>]</li>
<li><u>BLOOM</u>: <strong>&quot;BLOOM: A 176B-Parameter Open-Access Multilingual Language Model&quot;</strong>. <em>BigScience Workshop</em>. arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2211.05100">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/bigscience/bloom">Checkpoint</a>]</li>
<li><u>GLM</u>: <strong>&quot;GLM-130B: An Open Bilingual Pre-trained Model&quot;</strong>. <em>Aohan Zeng et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2210.02414">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/THUDM/GLM-130B">Checkpoint</a>]</li>
<li><u>Flan-T5</u>: <strong>&quot;Scaling Instruction-Finetuned Language Models&quot;</strong>. <em>Hyung Won Chung et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2210.11416">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints">Checkpoint</a>]</li>
<li><u>mT0 &amp;&amp; BLOOMZ</u>: <strong>&quot;Crosslingual Generalization through Multitask Finetuning&quot;</strong>. <em>Niklas Muennighoff et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2211.01786">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/bigscience-workshop/xmtf">Checkpoint</a>]</li>
<li><u>Galactica</u>: <strong>&quot;Galactica: A Large Language Model for Science&quot;</strong>. <em>Ross Taylor et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2211.09085">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/facebook/galactica-120b">Checkpoint</a>]</li>
<li><u>OPT-IML</u>: <strong>&quot;OPT-IML: Scaling Language Model Instruction Meta Learning through the Lens of Generalization&quot;</strong>. <em>Srinivasan Iyer et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2212.12017">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/facebook/opt-iml-30b">Checkpoint</a>]</li>
<li><u>CodeGeeX</u>: <strong>&quot;CodeGeeX: A Pre-Trained Model for Code Generation with Multilingual Evaluations on HumanEval-X&quot;</strong>. <em>Qinkai Zheng et al.</em> arXiv 2023. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2303.17568">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/THUDM/CodeGeeX">Checkpoint</a>]</li>
<li><u>Pythia</u>: <strong>&quot;Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling&quot;</strong>. <em>Stella Biderman et al.</em> arXiv 2023. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2304.01373">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/EleutherAI/pythia">Checkpoint</a>]</li>
<li><u>LLaMA</u>: <strong>&quot;LLaMA: Open and Efficient Foundation Language Models&quot;</strong>. <em>Hugo Touvron et al.</em> arXiv 2023. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2302.13971v1">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/facebookresearch/llama">Checkpoint</a>]</li>
</ol>
<h4 id="4-2-2-不开源大模型论文："><a href="#4-2-2-不开源大模型论文：" class="headerlink" title="4.2.2 不开源大模型论文："></a>4.2.2 不开源大模型论文：</h4><ol>
<li><u>GShard</u>: <strong>&quot;GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding&quot;</strong>. <em>Dmitry Lepikhin et al.</em> ICLR 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2006.16668v1">Paper</a>]</li>
<li><u>GPT-3</u>: <strong>&quot;Language Models are Few-Shot Learners&quot;</strong>. <em>Tom B. Brown et al.</em> NeurIPS 2020. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2005.14165">Paper</a>]</li>
<li><u>LaMDA</u>: <strong>&quot;LaMDA: Language Models for Dialog Applications&quot;</strong>. <em>Romal Thoppilan et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2201.08239">Paper</a>]</li>
<li><u>HyperCLOVA</u>: <strong>&quot;What Changes Can Large-scale Language Models Bring? Intensive Study on HyperCLOVA: Billions-scale Korean Generative Pretrained Transformers&quot;</strong>. <em>Boseop Kim et al.</em> EMNLP 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2109.04650">Paper</a>]</li>
<li><u>Codex</u>: <strong>&quot;Evaluating Large Language Models Trained on Code&quot;</strong>. <em>Mark Chen et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2107.03374">Paper</a>]</li>
<li><u>ERNIE 3.0</u>: <strong>&quot;ERNIE 3.0: Large-scale Knowledge Enhanced Pre-training for Language Understanding and Generation&quot;</strong>. <em>Yu Sun et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2107.02137">Paper</a>]</li>
<li><u>Jurassic-1</u>: <strong>&quot;Jurassic-1: Technical details and evaluation&quot;</strong>. <em>Opher Lieber et al.</em> 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://assets.website-files.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf">Paper</a>]</li>
<li><u>FLAN</u>: <strong>&quot;Finetuned Language Models Are Zero-Shot Learners&quot;</strong>. <em>Jason Wei et al.</em> ICLR 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2109.01652">Paper</a>]</li>
<li><u>MT-NLG</u>: <strong>&quot;Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model&quot;</strong>. <em>Shaden Smith et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2201.11990">Paper</a>]</li>
<li><u>Yuan 1.0</u>: <strong>&quot;Yuan 1.0: Large-Scale Pre-trained Language Model in Zero-Shot and Few-Shot Learning&quot;</strong>. <em>Shaohua Wu et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2110.04725">Paper</a>]</li>
<li><u>Anthropic</u>: <strong>&quot;A General Language Assistant as a Laboratory for Alignment&quot;</strong>. <em>Amanda Askell et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2112.00861">Paper</a>]</li>
<li><u>WebGPT</u>: <strong>&quot;WebGPT: Browser-assisted question-answering with human feedback&quot;</strong>. <em>Reiichiro Nakano et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2112.09332">Paper</a>]</li>
<li><u>Gopher</u>: <strong>&quot;Scaling Language Models: Methods, Analysis &amp; Insights from Training Gopher&quot;</strong>.  <em>Jack W. Rae et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2112.11446v2">Paper</a>]</li>
<li><u>ERNIE 3.0 Titan</u>: <strong>&quot;ERNIE 3.0 Titan: Exploring Larger-scale Knowledge Enhanced Pre-training for Language Understanding and Generation&quot;</strong>. <em>Shuohuan Wang et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2112.12731">Paper</a>]</li>
<li><u>GLaM</u>: <strong>&quot;GLaM: Efficient Scaling of Language Models with Mixture-of-Experts&quot;</strong>. <em>Nan Du et al.</em> ICML 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2112.06905">Paper</a>]</li>
<li><u>InstructGPT</u>: <strong>&quot;Training language models to follow instructions with human feedback&quot;</strong>. <em>Long Ouyang et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2203.02155v1">Paper</a>]</li>
<li><u>AlphaCode</u>: <strong>&quot;Competition-Level Code Generation with AlphaCode&quot;</strong>. <em>Yujia Li et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2203.07814v1">Paper</a>]</li>
<li><u>Chinchilla</u>: <strong>&quot;Training Compute-Optimal Large Language Models&quot;</strong>. <em>Jordan Hoffmann et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2203.15556">Paper</a>]</li>
<li><u>PaLM</u>: <strong>&quot;PaLM: Scaling Language Modeling with Pathways&quot;</strong>. <em>Aakanksha Chowdhery et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2204.02311">Paper</a>]</li>
<li><u>AlexaTM</u>: <strong>&quot;AlexaTM 20B: Few-Shot Learning Using a Large-Scale Multilingual Seq2Seq Model&quot;</strong>. <em>Saleh Soltan et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2208.01448">Paper</a>]</li>
<li><u>Sparrow</u>: <strong>&quot;Improving alignment of dialogue agents via targeted human judgements&quot;</strong>. <em>Amelia Glaese et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2209.14375v1">Paper</a>]</li>
<li><u>WeLM</u>: <strong>&quot;WeLM: A Well-Read Pre-trained Language Model for Chinese&quot;</strong>. <em>Hui Su et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2209.10372">Paper</a>]</li>
<li><u>U-PaLM</u>: <strong>&quot;Transcending Scaling Laws with 0.1% Extra Compute&quot;</strong>. <em>Yi Tay et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2210.11399">Paper</a>]</li>
<li><u>Flan-PaLM &amp; Flan-U-PaLM</u>: <strong>&quot;Scaling Instruction-Finetuned Language Models&quot;</strong>. <em>Hyung Won Chung et al.</em> arXiv 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2210.11416">Paper</a>]</li>
<li><u>GPT-4</u>: <strong>&quot;GPT-4 Technical Report&quot;</strong>. <em>OpenAI</em>. arXiv 2023. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2303.08774v2">Paper</a>]</li>
<li><u>PanGu-Σ</u>: <strong>&quot;PanGu-Σ: Towards Trillion Parameter Language Model with Sparse Heterogeneous Computing&quot;</strong>. <em>Xiaozhe Ren et al.</em> arXiv 2023. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2303.10845">Paper</a>]</li>
</ol>
<h4 id="4-2-3-常用语料库"><a href="#4-2-3-常用语料库" class="headerlink" title="4.2.3 常用语料库"></a>4.2.3 常用语料库</h4><ol>
<li><u>BookCorpus</u>: <strong>&quot;Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books&quot;</strong>. <em>Yukun Zhu et al.</em>  ICCV 2015. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/1506.06724v1">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/datasets/bookcorpus">Source</a>]</li>
<li><u>Gutenberg</u>: [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://www.gutenberg.org/">Source</a>]</li>
<li><u>CommonCrawl</u>: [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://commoncrawl.org/">Source</a>]</li>
<li><u>C4</u>: <strong>&quot;Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer&quot;</strong>. <em>Colin Raffel et al.</em> JMLR 2019. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/1910.10683v3">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://www.tensorflow.org/datasets/catalog/c4">Source</a>]</li>
<li><u>CC-stories-R</u>: <strong>&quot;A Simple Method for Commonsense Reasoning&quot;</strong>. <em>Trieu H. Trinh et al.</em> arXiv 2018. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/1806.02847v2">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/datasets/spacemanidol/cc-stories">Source</a>]</li>
<li><u>CC-NEWS</u>: <strong>&quot;RoBERTa: A Robustly Optimized BERT Pretraining Approach&quot;</strong>. <em>Yinhan Liu et al.</em> arXiv 2019. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/1907.11692v1">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://huggingface.co/datasets/cc_news">Source</a>]</li>
<li><u>RealNews</u>: <strong>&quot;Defending Against Neural Fake News&quot;</strong>. <em>Rowan Zellers et al.</em> NeurIPS 2019. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/1905.12616v3">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/rowanz/grover/tree/master/realnews">Source</a>]</li>
<li><u>OpenWebText</u>: [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://skylion007.github.io/OpenWebTextCorpus/">Source</a>]</li>
<li><u>Pushshift.io</u>: <strong>&quot;The Pushshift Reddit Dataset&quot;</strong>. <em>Jason Baumgartner et al</em>. AAAI 2020. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2001.08435v1">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://files.pushshift.io/reddit/">Source</a>]</li>
<li><u>Wikipedia</u>: [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://dumps.wikimedia.org/">Source</a>]</li>
<li><u>BigQuery</u>:  [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://cloud.google.com/bigquery/public-data?hl=zh-cn">Source</a>]</li>
<li><u>The Pile</u>: <strong>&quot;The Pile: An 800GB Dataset of Diverse Text for Language Modeling&quot;</strong>. <em>Leo Gao et al</em>. arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://arxiv.org/abs/2101.00027v1">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://pile.eleuther.ai/">Source</a>]</li>
<li><u>ROOTS</u>: <strong>&quot;The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset&quot;</strong>. <em>Laurençon et al</em>. NeurIPS 2022 Datasets and Benchmarks Track. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2303.03915">Paper</a>]</li>
</ol>
<h4 id="4-2-4-深度学习框架"><a href="#4-2-4-深度学习框架" class="headerlink" title="4.2.4 深度学习框架"></a>4.2.4 深度学习框架</h4><ol>
<li><u>PyTorch</u>: <strong>&quot;PyTorch: An Imperative Style, High-Performance Deep Learning Library&quot;</strong>. <em>Adam Paszke et al.</em> NeurIPS 2019. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/1912.01703">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://pytorch.org/">Source</a>]</li>
<li><u>TensorFlow</u>: <strong>&quot;TensorFlow: A system for large-scale machine learning&quot;</strong>. <em>Martín Abadi et al.</em> OSDI 2016. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/1605.08695">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://www.tensorflow.org/">Source</a>] </li>
<li><u>MXNet</u>: <strong>&quot;MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems&quot;</strong>. <em>Tianqi Chen et al.</em> arXiv 2015. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/1512.01274">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/apache/mxnet">Source</a>] </li>
<li><u>PaddlePaddle</u>: <strong>&quot;PaddlePaddle: An Open-Source Deep Learning Platform from Industrial Practice&quot;</strong>. <em>Yanjun Ma et al.</em> Frontiers of Data and Computing 2019. [<a target="_blank" rel="noopener external nofollow noreferrer" href="http://www.jfdc.cnic.cn/EN/abstract/abstract2.shtml">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/PaddlePaddle/Paddle">Source</a>]</li>
<li><u>MindSpore</u>: <strong>&quot;Huawei MindSpore AI Development Framework&quot;</strong> . <em>Huawei Technologies Co., Ltd.</em> Artificial Intelligence Technology 2022. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://link.springer.com/chapter/10.1007/978-981-19-2879-6_5">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/mindspore-ai/mindspore">Source</a>] </li>
<li><u>OneFlow</u>: <strong>&quot;OneFlow: Redesign the Distributed Deep Learning Framework from Scratch&quot;</strong> . <em>Jinhui Yuan et al.</em> arXiv 2021. [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2110.15032">Paper</a>] [<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/Oneflow-Inc/oneflow">Source</a>]</li>
</ol>
<hr>
<ol>
<li>原文地址：<a target="_blank" rel="noopener external nofollow noreferrer" href="https://arxiv.org/abs/2303.18223">A Survey of Large Language Models (arxiv.org)</a></li>
<li>知乎地址：<a target="_blank" rel="noopener external nofollow noreferrer" href="https://zhuanlan.zhihu.com/p/641376419">大模型综述升级啦 (zhihu.com)</a></li>
<li>GitHub 地址：<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/RUCAIBox/LLMSurvey">A Survey of Large Language Models (github.com)</a></li>
<li>中文版本：<a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/RUCAIBox/LLMSurvey/blob/main/assets/LLM_Survey_Chinese.pdf">LLM_Survey_Chinese.pdf (github.com)</a></li>
</ol>
<p>如果觉得文章对你有帮助的话，欢迎引用文章的原文</p>
<figure class="highlight tex"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br></pre></td><td class="code"><pre><span class="line">@article&#123;LLMSurvey,</span><br><span class="line">    title=&#123;A Survey of Large Language Models&#125;,</span><br><span class="line">    author=&#123;Zhao, Wayne Xin and Zhou, Kun and Li, Junyi and Tang, Tianyi and Wang, Xiaolei and Hou, Yupeng and Min, Yingqian and Zhang, Beichen and Zhang, Junjie and Dong, Zican and Du, Yifan and Yang, Chen and Chen, Yushuo and Chen, Zhipeng and Jiang, Jinhao and Ren, Ruiyang and Li, Yifan and Tang, Xinyu and Liu, Zikang and Liu, Peiyu and Nie, Jian-Yun and Wen, Ji-Rong&#125;,</span><br><span class="line">    year=&#123;2023&#125;,</span><br><span class="line">    journal=&#123;arXiv preprint arXiv:2303.18223&#125;,</span><br><span class="line">    url=&#123;http://arxiv.org/abs/2303.18223&#125;</span><br><span class="line">&#125;</span><br></pre></td></tr></table></figure>

</article><div class="post-copyright"><div class="post-copyright__author"><span class="post-copyright-meta"><i class="fas fa-circle-user fa-fw"></i>文章作者: </span><span class="post-copyright-info"><a href="https://blog.david-deng.cn">David</a></span></div><div class="post-copyright__type"><span class="post-copyright-meta"><i class="fas fa-square-arrow-up-right fa-fw"></i>文章链接: </span><span class="post-copyright-info"><a href="https://blog.david-deng.cn/2023/10/25/1%202023-10-25%20%E7%BB%84%E4%BC%9A%E5%88%86%E4%BA%AB/">https://blog.david-deng.cn/2023/10/25/1 2023-10-25 组会分享/</a></span></div><div class="post-copyright__notice"><span class="post-copyright-meta"><i class="fas fa-circle-exclamation fa-fw"></i>版权声明: </span><span class="post-copyright-info">本博客所有文章除特别声明外，均采用 <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" rel="external nofollow noreferrer" target="_blank">CC BY-NC-SA 4.0</a> 许可协议。转载请注明来源 <a href="https://blog.david-deng.cn" target="_blank">David 的博客</a>！</span></div></div><div class="tag_share"><div class="post-meta__tag-list"><a class="post-meta__tags" href="/tags/AI/">AI</a><a class="post-meta__tags" href="/tags/NLP/">NLP</a></div><div class="post-share"><div class="social-share" data-image="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-2.png" data-sites="facebook,twitter,wechat,weibo,qq"></div><link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/butterfly-extsrc/sharejs/dist/css/share.min.css" media="print" onload="this.media='all'"><script src="https://cdn.jsdelivr.net/npm/butterfly-extsrc/sharejs/dist/js/social-share.min.js" defer></script></div></div><div class="post-reward"><div class="reward-button"><i class="fas fa-qrcode"></i>赞助</div><div class="reward-main"><ul class="reward-all"><li class="reward-item"><a href="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/WeChatPay.jpg" rel="external nofollow noreferrer" target="_blank"><img class="post-qr-code-img" 
src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/WeChatPay.jpg" alt="微信"/></a><div class="post-qr-code-desc">微信</div></li><li class="reward-item"><a href="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/Alipay.jpg" rel="external nofollow noreferrer" target="_blank"><img class="post-qr-code-img" src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/Alipay.jpg" alt="支付宝"/></a><div class="post-qr-code-desc">支付宝</div></li></ul></div></div><nav class="pagination-post" id="pagination"><a class="pagination-related" href="/2023/10/17/%E3%80%90%E6%95%99%E7%A8%8B%E3%80%91OverLeaf%20%E5%85%8D%E8%B4%B9%E6%B0%B8%E4%B9%85%E5%8D%87%E7%BA%A7%20Professional%20%E8%AE%A2%E9%98%85/" title="【教程】OverLeaf 免费永久升级 Professional 订阅"><img class="cover" src="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-9.png" onerror="onerror=null;src='/img/404.jpg'" alt="cover of previous post"><div class="info"><div class="info-1"><div class="info-item-1">上一篇</div><div class="info-item-2">【教程】OverLeaf 免费永久升级 Professional 订阅</div></div><div class="info-2"><div class="info-item-1">【教程】OverLeaf 免费永久升级 Professional 订阅在读博士或者硕士的同学，可能经常会需要用到LaTex和Overleaf来对论文进行编辑和排版。常见的LaTex有两种使用的方式，一种是下载到自己本地然后在本地进行编译运行，另一种是在线编译运行。这里需要讲到的是在线编译运行的方式使用LaTex。 Overleaf 是在线编译运行LaTex的代表软件之一。Overleaf因其界面友好、可实时修改评论，且支持多人在线协作而受到许多人的欢迎。它支持多种运行的环境，有非常多的论文模板可以选用比如像ACM的会议通常会提供这两种工具的模版，并且是完全免费的。 Overleaf的免费版本是不能解锁review模式等全部功能的。今天在这里分享一个实用小技巧，让大家可以免费获得Professional Plan，解锁全部功能！ OverLeaf 官网 中文官网：Overleaf, 在线LaTeX编辑器 英文官网：Overleaf, Online LaTeX Editor  IEEE Collabratec 官网 英文官网：IEEE...</div></div></div></a><a class="pagination-related" href="/2023/10/28/heapq-%E5%A0%86%E9%98%9F%E5%88%97%E7%AE%97%E6%B3%95/" title="heapq 堆队列算法"><img class="cover" src="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-7.png" onerror="onerror=null;src='/img/404.jpg'" alt="cover of next post"><div class="info text-right"><div class="info-1"><div class="info-item-1">下一篇</div><div class="info-item-2">heapq 堆队列算法</div></div><div 
class="info-2"><div class="info-item-1">Python heapq 堆队列算法模块解释说明源码：Lib&#x2F;heapq.py  这个模块实现了堆队列算法，即优先队列算法。 堆是一棵完全二叉树，其中每个节点的值都小于等于其各个子节点的值。这个使用数组的实现，索引从 0 开始，且对所有的 k 都有 heap[k] &lt;= heap[2*k+1] 和 heap[k] &lt;= heap[2*k+2]。比较时不存在的元素被认为是无限大。堆最有趣的特性在于最小的元素总是在根结点：heap[0]。 这个API与教材的堆算法实现有所不同，具体区别有两方面：（a）我们使用了从零开始的索引。这使得节点和其孩子节点索引之间的关系不太直观但更加适合，因为 Python 使用从零开始的索引。 （b）我们的 pop 方法返回最小的项而不是最大的项（这在教材中称为“最小堆”；而“最大堆”在教材中更为常见，因为它更适用于原地排序）。 基于这两方面，把堆看作原生的Python list也没什么奇怪的： heap[0] 表示最小的元素，同时 heap.sort() 维护了堆的不变性！ 要创建一个堆，可以新建一个空列表 []，或者用函数...</div></div></div></a></nav><div class="relatedPosts"><div class="headline"><i class="fas fa-thumbs-up fa-fw"></i><span>相关推荐</span></div><div class="relatedPosts-list"><a class="pagination-related" href="/2024/01/04/2024-01-05%20%E5%A4%9A%E6%A8%A1%E6%80%81%E6%A8%A1%E5%9E%8B%E7%BB%BC%E8%BF%B0/" title="多模态基础模型：从专家到通用助理"><img class="cover" src="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-7.png" alt="cover"><div class="info text-center"><div class="info-1"><div class="info-item-1"><i class="far fa-calendar-alt fa-fw"></i> 2024-01-04</div><div class="info-item-2">多模态基础模型：从专家到通用助理</div></div><div class="info-2"><div class="info-item-1">多模态大模型综述</div></div></div></a></div></div></div><div class="aside-content" id="aside-content"><div class="card-widget card-info text-center"><div class="avatar-img"><img src="/img/avatar.png" onerror="this.onerror=null;this.src='/img/loading.gif'" alt="avatar"/></div><div class="author-info-name">David</div><div class="author-info-description">Welcome to David's Blog</div><div class="site-data"><a href="/archives/"><div class="headline">文章</div><div class="length-num">27</div></a><a href="/tags/"><div class="headline">标签</div><div class="length-num">28</div></a><a href="/categories/"><div class="headline">分类</div><div class="length-num">28</div></a></div><a id="card-info-btn" target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/david-deng-01"><i class="fab 
fa-github"></i><span>Follow Me</span></a><div class="card-info-social-icons"><a class="social-icon" href="https://github.com/david-deng-01" rel="external nofollow noreferrer" target="_blank" title="Github"><i class="fab fa-github"></i></a><a class="social-icon" href="https://blog.csdn.net/David_0925" rel="external nofollow noreferrer" target="_blank" title="CSDN"><i class="fa fa-book-open"></i></a><a class="social-icon" href="tencent://AddContact/?fromId=45&amp;fromSubId=1&amp;subcmd=all&amp;uin=635647792&amp;website=www.oicqzone.com" rel="external nofollow noreferrer" target="_blank" title="QQ"><i class="fab fa-qq"></i></a><a class="social-icon" href="mailto:david-deng-0925@qq.com" rel="external nofollow noreferrer" target="_blank" title="Email"><i class="fas fa-envelope-open-text"></i></a></div></div><div class="sticky_layout"><div class="card-widget" id="card-toc"><div class="item-headline"><i class="fas fa-stream"></i><span>目录</span><span class="toc-percentage"></span></div><div class="toc-content"><ol class="toc"><li class="toc-item toc-level-1"><a class="toc-link" href="#%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E7%BB%BC%E8%BF%B0A-Survey-of-Large-Language-Models"><span class="toc-number">1.</span> <span class="toc-text">大语言模型综述A Survey of Large Language Models</span></a><ol class="toc-child"><li class="toc-item toc-level-2"><a class="toc-link" href="#0-%E6%91%98%E8%A6%81-Abstract"><span class="toc-number">1.1.</span> <span class="toc-text">0 摘要 (Abstract)</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#1-%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E8%BF%91%E5%B9%B4%E6%9D%A5%E7%9A%84%E5%8F%91%E5%B1%95"><span class="toc-number">1.2.</span> <span class="toc-text">1 大语言模型近年来的发展</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#1-1-%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%83%8C%E6%99%AF"><span class="toc-number">1.2.1.</span> <span class="toc-text">1.1 
大语言模型的背景</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#1-1-1-%E8%AF%AD%E8%A8%80%E5%BB%BA%E6%A8%A1%E7%9A%84%E5%9B%9B%E4%B8%AA%E4%B8%BB%E8%A6%81%E5%8F%91%E5%B1%95%E9%98%B6%E6%AE%B5%EF%BC%9A"><span class="toc-number">1.2.1.1.</span> <span class="toc-text">1.1.1 语言建模的四个主要发展阶段：</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#1-1-2-%E7%A0%94%E7%A9%B6%E7%83%AD%E5%BA%A6%E7%BB%9F%E8%AE%A1%EF%BC%9A"><span class="toc-number">1.2.1.2.</span> <span class="toc-text">1.1.2 研究热度统计：</span></a></li></ol></li><li class="toc-item toc-level-3"><a class="toc-link" href="#1-2-%E4%B8%BB%E8%A6%81%E5%8F%91%E7%8E%B0"><span class="toc-number">1.2.2.</span> <span class="toc-text">1.2 主要发现</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#1-3-%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%89%A9%E5%B1%95%E6%B3%95%E5%88%99"><span class="toc-number">1.2.3.</span> <span class="toc-text">1.3 大语言模型的扩展法则</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#1-3-1-KM%E6%89%A9%E5%B1%95%E6%B3%95%E5%88%99"><span class="toc-number">1.2.3.1.</span> <span class="toc-text">1.3.1 KM扩展法则</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#1-3-2-Chinchilla-%E6%89%A9%E5%B1%95%E6%B3%95%E5%88%99"><span class="toc-number">1.2.3.2.</span> <span class="toc-text">1.3.2 Chinchilla 扩展法则</span></a></li></ol></li><li class="toc-item toc-level-3"><a class="toc-link" href="#1-4-%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%B6%8C%E7%8E%B0%E8%83%BD%E5%8A%9B"><span class="toc-number">1.2.4.</span> <span class="toc-text">1.4 大语言模型的涌现能力</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#1-5-%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E7%9A%84%E5%85%B3%E9%94%AE%E6%8A%80%E6%9C%AF"><span class="toc-number">1.2.5.</span> <span class="toc-text">1.5 大语言模型的关键技术</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" 
href="#1-6-%E5%A4%A7%E6%A8%A1%E5%9E%8B%E7%9A%84%E5%8F%91%E5%B1%95%E5%8F%B2"><span class="toc-number">1.2.6.</span> <span class="toc-text">1.6 大模型的发展史</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#1-6-1-%E5%90%84%E7%A7%8D%E5%A4%A7%E6%A8%A1%E5%9E%8B%E7%9A%84%E5%8F%91%E5%B8%83%E6%97%B6%E9%97%B4"><span class="toc-number">1.2.6.1.</span> <span class="toc-text">1.6.1 各种大模型的发布时间</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#1-6-2-GPT-%E7%B3%BB%E5%88%97%E6%A8%A1%E5%9E%8B%E7%9A%84%E5%8F%91%E5%B1%95"><span class="toc-number">1.2.6.2.</span> <span class="toc-text">1.6.2 GPT 系列模型的发展</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#1-6-3-LLaMA-%E7%B3%BB%E5%88%97%E6%A8%A1%E5%9E%8B%E7%9A%84%E5%8F%91%E5%B1%95"><span class="toc-number">1.2.6.3.</span> <span class="toc-text">1.6.3 LLaMA 系列模型的发展</span></a><ol class="toc-child"><li class="toc-item toc-level-5"><a class="toc-link" href="#%E4%B8%A4%E5%BC%A0%E5%8F%AF%E7%88%B1%E7%9A%84%E5%85%B3%E4%BA%8E-LLaMA-%E7%9A%84%E6%BC%AB%E7%94%BB%EF%BC%9A"><span class="toc-number">1.2.6.3.1.</span> <span class="toc-text">两张可爱的关于 LLaMA 的漫画：</span></a></li><li class="toc-item toc-level-5"><a class="toc-link" href="#LLaMA-%E7%9A%84%E5%8F%91%E5%B1%95%E5%92%8C%E6%BC%94%E5%8F%98%EF%BC%9A"><span class="toc-number">1.2.6.3.2.</span> <span class="toc-text">LLaMA 的发展和演变：</span></a></li></ol></li></ol></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#2-%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E5%9B%9B%E4%B8%AA%E4%B8%BB%E8%A6%81%E6%96%B9%E9%9D%A2"><span class="toc-number">1.3.</span> <span class="toc-text">2 大语言模型四个主要方面</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#2-1-%E9%A2%84%E8%AE%AD%E7%BB%83"><span class="toc-number">1.3.1.</span> <span class="toc-text">2.1 预训练</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" 
href="#2-1-1-%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E8%BF%9B%E8%A1%8C%E9%A2%84%E8%AE%AD%E7%BB%83%EF%BC%9F"><span class="toc-number">1.3.1.1.</span> <span class="toc-text">2.1.1 为什么要进行预训练？</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-1-2-%E9%A2%84%E8%AE%AD%E7%BB%83%E5%8C%85%E6%8B%AC%E5%93%AA%E4%BA%9B%E8%BF%87%E7%A8%8B%EF%BC%9F"><span class="toc-number">1.3.1.2.</span> <span class="toc-text">2.1.2 预训练包括哪些过程？</span></a><ol class="toc-child"><li class="toc-item toc-level-5"><a class="toc-link" href="#%E6%95%B0%E6%8D%AE%E9%A2%84%E5%A4%84%E7%90%86"><span class="toc-number">1.3.1.2.1.</span> <span class="toc-text">数据预处理</span></a></li><li class="toc-item toc-level-5"><a class="toc-link" href="#%E6%A8%A1%E5%9E%8B%E8%AE%AD%E7%BB%83%E6%97%B6%E5%90%84%E7%A7%8D%E6%95%B0%E6%8D%AE%E6%9D%A5%E6%BA%90%E7%9A%84%E5%8D%A0%E6%AF%94"><span class="toc-number">1.3.1.2.2.</span> <span class="toc-text">模型训练时各种数据来源的占比</span></a></li><li class="toc-item toc-level-5"><a class="toc-link" href="#%E4%B8%BB%E6%B5%81%E7%9A%84%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E6%9E%B6%E6%9E%84"><span class="toc-number">1.3.1.2.3.</span> <span class="toc-text">主流的大语言模型架构</span></a></li></ol></li></ol></li><li class="toc-item toc-level-3"><a class="toc-link" href="#2-2-%E9%80%82%E9%85%8D%E5%BE%AE%E8%B0%83"><span class="toc-number">1.3.2.</span> <span class="toc-text">2.2 适配微调</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#2-2-1-%E6%8C%87%E4%BB%A4%E5%BE%AE%E8%B0%83-instruction-tuning"><span class="toc-number">1.3.2.1.</span> <span class="toc-text">2.2.1 指令微调(instruction tuning)</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-2-2-%E5%AF%B9%E9%BD%90%E5%BE%AE%E8%B0%83-alignment-tuning"><span class="toc-number">1.3.2.2.</span> <span class="toc-text">2.2.2 对齐微调(alignment tuning)</span></a></li></ol></li><li class="toc-item toc-level-3"><a class="toc-link" href="#2-3-%E4%BD%BF%E7%94%A8"><span 
class="toc-number">1.3.3.</span> <span class="toc-text">2.3 使用</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#2-3-1-%E4%B8%8A%E4%B8%8B%E6%96%87%E5%AD%A6%E4%B9%A0-in-context-learning-ICL"><span class="toc-number">1.3.3.1.</span> <span class="toc-text">2.3.1 上下文学习(in-context learning, ICL)</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-3-2-%E6%80%9D%E7%BB%B4%E9%93%BE%E6%8F%90%E7%A4%BA-chain-of-thought-prompting"><span class="toc-number">1.3.3.2.</span> <span class="toc-text">2.3.2 思维链提示(chain-of-thought prompting)</span></a></li></ol></li><li class="toc-item toc-level-3"><a class="toc-link" href="#2-4-%E8%83%BD%E5%8A%9B%E8%AF%84%E4%BC%B0"><span class="toc-number">1.3.4.</span> <span class="toc-text">2.4 能力评估</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#2-4-1-%E8%AF%AD%E8%A8%80%E7%94%9F%E6%88%90"><span class="toc-number">1.3.4.1.</span> <span class="toc-text">2.4.1 语言生成</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-4-2-%E7%9F%A5%E8%AF%86%E5%88%A9%E7%94%A8"><span class="toc-number">1.3.4.2.</span> <span class="toc-text">2.4.2 知识利用</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#2-4-3-%E5%A4%8D%E6%9D%82%E6%8E%A8%E7%90%86"><span class="toc-number">1.3.4.3.</span> <span class="toc-text">2.4.3 复杂推理</span></a></li></ol></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#3-%E6%80%BB%E7%BB%93%E4%B8%8E%E6%9C%AA%E6%9D%A5%E6%96%B9%E5%90%91"><span class="toc-number">1.4.</span> <span class="toc-text">3 总结与未来方向</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#3-1-%E6%80%BB%E7%BB%93"><span class="toc-number">1.4.1.</span> <span class="toc-text">3.1 总结</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#3-2-%E6%9C%AA%E6%9D%A5%E6%96%B9%E5%90%91"><span class="toc-number">1.4.2.</span> <span class="toc-text">3.2 
未来方向</span></a></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#4-%E5%85%B3%E4%BA%8E%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%9C%80%E6%96%B0%E6%96%87%E7%8C%AE%E7%BB%BC%E8%BF%B0"><span class="toc-number">1.5.</span> <span class="toc-text">4 关于大语言模型的最新文献综述</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#4-1-%E5%A4%A7%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B%E5%88%97%E8%A1%A8%EF%BC%9A"><span class="toc-number">1.5.1.</span> <span class="toc-text">4.1 大语言模型列表：</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#4-2-%E5%A4%A7%E6%A8%A1%E5%9E%8B%E7%9B%B8%E5%85%B3%E8%AE%BA%E6%96%87"><span class="toc-number">1.5.2.</span> <span class="toc-text">4.2 大模型相关论文</span></a><ol class="toc-child"><li class="toc-item toc-level-4"><a class="toc-link" href="#4-2-1-%E5%BC%80%E6%BA%90%E5%A4%A7%E6%A8%A1%E5%9E%8B%E8%AE%BA%E6%96%87%EF%BC%9A"><span class="toc-number">1.5.2.1.</span> <span class="toc-text">4.2.1 开源大模型论文：</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#4-2-2-%E4%B8%8D%E5%BC%80%E6%BA%90%E5%A4%A7%E6%A8%A1%E5%9E%8B%E8%AE%BA%E6%96%87%EF%BC%9A"><span class="toc-number">1.5.2.2.</span> <span class="toc-text">4.2.2 不开源大模型论文：</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#4-2-3-%E5%B8%B8%E7%94%A8%E8%AF%AD%E6%96%99%E5%BA%93"><span class="toc-number">1.5.2.3.</span> <span class="toc-text">4.2.3 常用语料库</span></a></li><li class="toc-item toc-level-4"><a class="toc-link" href="#4-2-4-%E6%B7%B1%E5%BA%A6%E5%AD%A6%E4%B9%A0%E6%A1%86%E6%9E%B6"><span class="toc-number">1.5.2.4.</span> <span class="toc-text">4.2.4 深度学习框架</span></a></li></ol></li></ol></li></ol></li></ol></div></div><div class="card-widget card-recent-post"><div class="item-headline"><i class="fas fa-history"></i><span>最新文章</span></div><div class="aside-list"><div class="aside-list-item"><a class="thumbnail" 
href="/2025/01/05/other-%E9%9A%8F%E7%AC%94-icarus-%E4%B8%BB%E9%A2%98%E5%AE%89%E8%A3%85/" title="Hexo 配置 Icarus 主题"><img src="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-4.png" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="Hexo 配置 Icarus 主题"/></a><div class="content"><a class="title" href="/2025/01/05/other-%E9%9A%8F%E7%AC%94-icarus-%E4%B8%BB%E9%A2%98%E5%AE%89%E8%A3%85/" title="Hexo 配置 Icarus 主题">Hexo 配置 Icarus 主题</a><time datetime="2025-01-05T08:25:00.000Z" title="发表于 2025-01-05 16:25:00">2025-01-05</time></div></div><!-- Each .aside-list-item is one recent-post entry: a thumbnail link whose img swaps to /img/404.jpg on a load error (the handler clears itself first so a failing fallback cannot loop), a title link to the same URL, and a time element whose datetime attribute is in UTC while its title and text show the date 8 hours ahead. --><div class="aside-list-item"><a class="thumbnail" href="/2024/03/18/other-leetcode-%E3%80%902024-03-18-LeetCode-%E5%88%B7%E9%A2%98-400%E9%81%93%E6%89%93%E5%8D%A1%E3%80%91/" title="LeetCode刷题400道打卡"><img src="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-9.png" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="LeetCode刷题400道打卡"/></a><div class="content"><a class="title" href="/2024/03/18/other-leetcode-%E3%80%902024-03-18-LeetCode-%E5%88%B7%E9%A2%98-400%E9%81%93%E6%89%93%E5%8D%A1%E3%80%91/" title="LeetCode刷题400道打卡">LeetCode刷题400道打卡</a><time datetime="2024-03-17T16:00:00.000Z" title="发表于 2024-03-18 00:00:00">2024-03-18</time></div></div><div class="aside-list-item"><a class="thumbnail" href="/2024/01/04/2024-01-05%20%E5%A4%9A%E6%A8%A1%E6%80%81%E6%A8%A1%E5%9E%8B%E7%BB%BC%E8%BF%B0/" title="多模态基础模型：从专家到通用助理"><img src="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-7.png" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="多模态基础模型：从专家到通用助理"/></a><div class="content"><a class="title" href="/2024/01/04/2024-01-05%20%E5%A4%9A%E6%A8%A1%E6%80%81%E6%A8%A1%E5%9E%8B%E7%BB%BC%E8%BF%B0/" title="多模态基础模型：从专家到通用助理">多模态基础模型：从专家到通用助理</a><time datetime="2024-01-04T14:42:08.000Z" title="发表于 2024-01-04 22:42:08">2024-01-04</time></div></div><div class="aside-list-item"><a class="thumbnail" href="/2023/12/30/%E6%A0%A1%E5%9B%AD%E7%BD%91%E8%87%AA%E5%8A%A8%E7%99%BB%E5%BD%95%E8%84%9A%E6%9C%AC/" title="校园网自动登录脚本"><img 
src="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-9.png" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="校园网自动登录脚本"/></a><div class="content"><a class="title" href="/2023/12/30/%E6%A0%A1%E5%9B%AD%E7%BD%91%E8%87%AA%E5%8A%A8%E7%99%BB%E5%BD%95%E8%84%9A%E6%9C%AC/" title="校园网自动登录脚本">校园网自动登录脚本</a><time datetime="2023-12-30T09:21:30.000Z" title="发表于 2023-12-30 17:21:30">2023-12-30</time></div></div><div class="aside-list-item"><a class="thumbnail" href="/2023/11/24/%E5%A4%9A%E6%A8%A1%E6%80%81%E8%AE%BD%E5%88%BA%E8%AF%86%E5%88%AB%E5%9F%BA%E7%BA%BF%E6%A8%A1%E5%9E%8B%E5%A4%8D%E7%8E%B0/" title="多模态讽刺识别基线模型复现"><img src="https://jsd.012700.xyz/gh/jerryc127/CDN/img/material-10.png" onerror="this.onerror=null;this.src='/img/404.jpg'" alt="多模态讽刺识别基线模型复现"/></a><div class="content"><a class="title" href="/2023/11/24/%E5%A4%9A%E6%A8%A1%E6%80%81%E8%AE%BD%E5%88%BA%E8%AF%86%E5%88%AB%E5%9F%BA%E7%BA%BF%E6%A8%A1%E5%9E%8B%E5%A4%8D%E7%8E%B0/" title="多模态讽刺识别基线模型复现">多模态讽刺识别基线模型复现</a><time datetime="2023-11-24T08:51:18.000Z" title="发表于 2023-11-24 16:51:18">2023-11-24</time></div></div></div></div></div></div></main><!-- Site footer: copyright line, framework/theme credits, and the two registration-record links. The record badge image is decorative (the record number follows as link text), so it carries an empty alt; the record links use the same rel tokens as the credit links because they also open in a new tab. --><footer id="footer"><div id="footer-wrap"><div class="copyright">&copy;2022 - 2025 By David</div><div class="framework-info"><span>框架 </span><a target="_blank" rel="noopener external nofollow noreferrer" href="https://hexo.io">Hexo</a><span class="footer-separator">|</span><span>主题 </span><a target="_blank" rel="noopener external nofollow noreferrer" href="https://github.com/jerryc127/hexo-theme-butterfly">Butterfly</a></div><div class="footer_custom_text"><a href="http://www.beian.gov.cn/portal/registerSystemInfo" rel="noopener external nofollow noreferrer" target="_blank"> <img style="vertical-align:middle; width:20px; " src="https://cdn.jsdelivr.net/gh/David-deng-01/images/blog/icp.png" alt=""> 赣公网安备36082302000135号</a> <a href="https://beian.miit.gov.cn/" rel="noopener external nofollow noreferrer" id="beian"  target="_blank">赣ICP备2023013705号-1</a></div></div></footer></div><div 
id="rightside"><!-- Floating right-side toolbar. The icon-only buttons repeat their title text in aria-label so they have a dependable accessible name, and the icon glyph elements are hidden from assistive technology. The translate button is left as is because it already has visible text. --><div id="rightside-config-hide"><button id="readmode" type="button" title="阅读模式" aria-label="阅读模式"><i class="fas fa-book-open" aria-hidden="true"></i></button><button id="darkmode" type="button" title="日间和夜间模式切换" aria-label="日间和夜间模式切换"><i class="fas fa-adjust" aria-hidden="true"></i></button><button id="hide-aside-btn" type="button" title="单栏和双栏切换" aria-label="单栏和双栏切换"><i class="fas fa-arrows-alt-h" aria-hidden="true"></i></button></div><div id="rightside-config-show"><button id="rightside-config" type="button" title="设置" aria-label="设置"><i class="fas fa-cog fa-spin" aria-hidden="true"></i></button><button class="close" id="mobile-toc-button" type="button" title="目录" aria-label="目录"><i class="fas fa-list-ul" aria-hidden="true"></i></button><button id="translateLink" type="button" title="简繁转换">繁</button><button id="go-up" type="button" title="回到顶部" aria-label="回到顶部"><span class="scroll-percent"></span><i class="fas fa-arrow-up" aria-hidden="true"></i></button></div></div><div><!-- Page scripts: local theme scripts first, then CDN extras. Third-party URLs use an explicit https scheme instead of a protocol-relative one. --><script src="/js/utils.js"></script><script src="/js/main.js"></script><script src="/js/tw_cn.js"></script><script src="https://cdn.jsdelivr.net/npm/node-snackbar/dist/snackbar.min.js"></script><div class="js-pjax"></div><script id="canvas_nest" defer color="0,0,255" opacity="0.7" zIndex="-1" count="99" mobile="false" src="https://cdn.jsdelivr.net/npm/butterfly-extsrc/dist/canvas-nest.min.js"></script><script async data-pjax src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script></div></body></html>